diff --git a/CMakeLists.txt b/CMakeLists.txt index d858feb..412dea9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,15 +18,13 @@ add_library(memcpy STATIC "${LIBMEMCPY_PLATFORM_FILES}" # BEGIN GENERATED FILES src/names-memcpy.c - impls/memcpy-ssse3-back.s - impls/memcpy-ssse3.s impls/memmove-avx-unaligned-erms-rtm.s impls/memmove-avx-unaligned-erms.s impls/memmove-avx512-no-vzeroupper.s impls/memmove-avx512-unaligned-erms.s + impls/memmove-erms.s impls/memmove-evex-unaligned-erms.s impls/memmove-sse2-unaligned-erms.s - impls/memmove-ssse3-back.s impls/memmove-ssse3.s # END GENERATED FILES ) diff --git a/impls/memcpy-ssse3-back.s b/impls/memcpy-ssse3-back.s deleted file mode 100644 index 698258e..0000000 --- a/impls/memcpy-ssse3-back.s +++ /dev/null @@ -1,3067 +0,0 @@ - .section .text.ssse3 - -.globl __mempcpy_ssse3_back -__mempcpy_ssse3_back: - mov %rdi, %rax - add %rdx, %rax - jmp .Lstart - -.globl __memcpy_ssse3_back -__memcpy_ssse3_back: - mov %rdi, %rax -.Lstart: - cmp $144, %rdx - jae .L144bytesormore - -.Lfwd_write_less32bytes: - - cmp %dil, %sil - jbe .Lbk_write - - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - -.Lbk_write: - - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.L144bytesormore: - - cmp %dil, %sil - jle .Lcopy_backward - - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz .Lshl_0 - - mov __x86_data_cache_size(%rip), %rcx - - cmp %rcx, %rdx - jae .Lgobble_mem_fwd - lea .Lshl_table_fwd(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - jmp *%r9 - ud2 - - .p2align 4 -.Lcopy_backward: - - mov __x86_data_cache_size(%rip), %rcx - - shl $1, %rcx - cmp %rcx, %rdx - ja .Lgobble_mem_bwd - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz .Lshl_0_bwd - lea .Lshl_table_bwd(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - jmp *%r9 - ud2 - - .p2align 4 -.Lshl_0: - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 - - cmp __x86_data_cache_size_half(%rip), %r9 - - jae .Lgobble_mem_fwd - sub $0x80, %rdx - .p2align 4 -.Lshl_0_loop: - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae .Lshl_0_loop - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_bwd: - sub $0x80, %rdx -.Lcopy_backward_loop: - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lcopy_backward_loop - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1: - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_1 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1_bwd: - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_1_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2: - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_2 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2_bwd: - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_2_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3: - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_3 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3_bwd: - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_3_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4: - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_4 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4_bwd: - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_4_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5: - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_5 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5_bwd: - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_5_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6: - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_6 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6_bwd: - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_6_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7: - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_7 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7_bwd: - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_7_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8: - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_8 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8_bwd: - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_8_bwd -.Lshl_8_end_bwd: - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9: - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_9 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9_bwd: - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_9_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10: - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_10 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10_bwd: - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_10_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11: - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_11 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11_bwd: - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_11_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12: - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae .Lshl_12 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12_bwd: - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_12_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13: - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_13 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13_bwd: - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_13_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14: - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_14 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14_bwd: - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_14_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15: - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_15 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15_bwd: - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_15_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lgobble_mem_fwd: - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - - mov __x86_shared_cache_size_half(%rip), %rcx - cmp %rcx, %rdx - ja .Lbigger_in_fwd - mov %rdx, %rcx -.Lbigger_in_fwd: - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe .Lll_cache_copy_fwd - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe .L2steps_copy_fwd - add %rcx, %rdx - xor %rcx, %rcx -.L2steps_copy_fwd: - sub $0x80, %rdx -.Lgobble_mem_fwd_loop: - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae .Lgobble_mem_fwd_loop - sfence - cmp $0x80, %rcx - jb .Lgobble_mem_fwd_end - add $0x80, %rdx -.Lll_cache_copy_fwd: - add %rcx, %rdx -.Lll_cache_copy_fwd_start: - sub $0x80, %rdx -.Lgobble_ll_loop_fwd: - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae .Lgobble_ll_loop_fwd -.Lgobble_mem_fwd_end: - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lgobble_mem_bwd: - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - mov __x86_shared_cache_size_half(%rip), %rcx - cmp %rcx, %rdx - ja .Lbigger - mov %rdx, %rcx -.Lbigger: - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe .Lll_cache_copy - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe .L2steps_copy - add %rcx, %rdx - xor %rcx, %rcx -.L2steps_copy: - sub $0x80, %rdx -.Lgobble_mem_bwd_loop: - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae .Lgobble_mem_bwd_loop - sfence - cmp $0x80, %rcx - jb .Lgobble_mem_bwd_end - add $0x80, %rdx -.Lll_cache_copy: - add %rcx, %rdx -.Lll_cache_copy_bwd_start: - sub $0x80, %rdx -.Lgobble_ll_loop: - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae .Lgobble_ll_loop -.Lgobble_mem_bwd_end: - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lfwd_write_128bytes: - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -.Lfwd_write_112bytes: - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -.Lfwd_write_96bytes: - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -.Lfwd_write_80bytes: - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -.Lfwd_write_64bytes: - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -.Lfwd_write_48bytes: - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -.Lfwd_write_32bytes: - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -.Lfwd_write_16bytes: - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -.Lfwd_write_0bytes: - ret - - .p2align 4 -.Lfwd_write_143bytes: - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -.Lfwd_write_127bytes: - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -.Lfwd_write_111bytes: - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -.Lfwd_write_95bytes: - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -.Lfwd_write_79bytes: - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -.Lfwd_write_63bytes: - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -.Lfwd_write_47bytes: - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -.Lfwd_write_31bytes: - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_15bytes: - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_142bytes: - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -.Lfwd_write_126bytes: - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) -.Lfwd_write_110bytes: - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -.Lfwd_write_94bytes: - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -.Lfwd_write_78bytes: - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -.Lfwd_write_62bytes: - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -.Lfwd_write_46bytes: - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -.Lfwd_write_30bytes: - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_14bytes: - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_141bytes: - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -.Lfwd_write_125bytes: - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -.Lfwd_write_109bytes: - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -.Lfwd_write_93bytes: - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -.Lfwd_write_77bytes: - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -.Lfwd_write_61bytes: - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -.Lfwd_write_45bytes: - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -.Lfwd_write_29bytes: - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_13bytes: - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_140bytes: - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -.Lfwd_write_124bytes: - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -.Lfwd_write_108bytes: - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -.Lfwd_write_92bytes: - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -.Lfwd_write_76bytes: - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -.Lfwd_write_60bytes: - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -.Lfwd_write_44bytes: - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -.Lfwd_write_28bytes: - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_12bytes: - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_139bytes: - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -.Lfwd_write_123bytes: - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -.Lfwd_write_107bytes: - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -.Lfwd_write_91bytes: - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -.Lfwd_write_75bytes: - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -.Lfwd_write_59bytes: - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -.Lfwd_write_43bytes: - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -.Lfwd_write_27bytes: - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_11bytes: - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_138bytes: - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -.Lfwd_write_122bytes: - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -.Lfwd_write_106bytes: - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -.Lfwd_write_90bytes: - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -.Lfwd_write_74bytes: - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -.Lfwd_write_58bytes: - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -.Lfwd_write_42bytes: - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -.Lfwd_write_26bytes: - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_10bytes: - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_137bytes: - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -.Lfwd_write_121bytes: - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) -.Lfwd_write_105bytes: - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -.Lfwd_write_89bytes: - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -.Lfwd_write_73bytes: - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -.Lfwd_write_57bytes: - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -.Lfwd_write_41bytes: - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -.Lfwd_write_25bytes: - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_9bytes: - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_136bytes: - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -.Lfwd_write_120bytes: - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -.Lfwd_write_104bytes: - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -.Lfwd_write_88bytes: - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -.Lfwd_write_72bytes: - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -.Lfwd_write_56bytes: - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -.Lfwd_write_40bytes: - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -.Lfwd_write_24bytes: - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_8bytes: - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_135bytes: - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -.Lfwd_write_119bytes: - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -.Lfwd_write_103bytes: - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -.Lfwd_write_87bytes: - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -.Lfwd_write_71bytes: - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -.Lfwd_write_55bytes: - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -.Lfwd_write_39bytes: - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -.Lfwd_write_23bytes: - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_7bytes: - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_134bytes: - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -.Lfwd_write_118bytes: - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -.Lfwd_write_102bytes: - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -.Lfwd_write_86bytes: - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -.Lfwd_write_70bytes: - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -.Lfwd_write_54bytes: - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -.Lfwd_write_38bytes: - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -.Lfwd_write_22bytes: - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_6bytes: - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_133bytes: - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) -.Lfwd_write_117bytes: - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -.Lfwd_write_101bytes: - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -.Lfwd_write_85bytes: - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -.Lfwd_write_69bytes: - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) -.Lfwd_write_53bytes: - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -.Lfwd_write_37bytes: - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -.Lfwd_write_21bytes: - lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_5bytes: - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_132bytes: - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -.Lfwd_write_116bytes: - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -.Lfwd_write_100bytes: - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -.Lfwd_write_84bytes: - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -.Lfwd_write_68bytes: - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -.Lfwd_write_52bytes: - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -.Lfwd_write_36bytes: - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -.Lfwd_write_20bytes: - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_4bytes: - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_131bytes: - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -.Lfwd_write_115bytes: - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -.Lfwd_write_99bytes: - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -.Lfwd_write_83bytes: - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -.Lfwd_write_67bytes: - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -.Lfwd_write_51bytes: - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -.Lfwd_write_35bytes: - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -.Lfwd_write_19bytes: - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_3bytes: - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -.Lfwd_write_130bytes: - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -.Lfwd_write_114bytes: - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -.Lfwd_write_98bytes: - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -.Lfwd_write_82bytes: - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -.Lfwd_write_66bytes: - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -.Lfwd_write_50bytes: - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -.Lfwd_write_34bytes: - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -.Lfwd_write_18bytes: - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_2bytes: - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -.Lfwd_write_129bytes: - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -.Lfwd_write_113bytes: - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -.Lfwd_write_97bytes: - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -.Lfwd_write_81bytes: - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -.Lfwd_write_65bytes: - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -.Lfwd_write_49bytes: - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -.Lfwd_write_33bytes: - lddqu -33(%rsi), %xmm0 - movdqu %xmm0, -33(%rdi) -.Lfwd_write_17bytes: - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_1bytes: - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lbwd_write_128bytes: - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) -.Lbwd_write_112bytes: - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -.Lbwd_write_96bytes: - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) -.Lbwd_write_80bytes: - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -.Lbwd_write_64bytes: - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -.Lbwd_write_48bytes: - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -.Lbwd_write_32bytes: - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -.Lbwd_write_16bytes: - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -.Lbwd_write_0bytes: - ret - - .p2align 4 -.Lbwd_write_143bytes: - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -.Lbwd_write_127bytes: - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -.Lbwd_write_111bytes: - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -.Lbwd_write_95bytes: - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -.Lbwd_write_79bytes: - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -.Lbwd_write_63bytes: - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -.Lbwd_write_47bytes: - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -.Lbwd_write_31bytes: - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_15bytes: - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_142bytes: - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -.Lbwd_write_126bytes: - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -.Lbwd_write_110bytes: - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -.Lbwd_write_94bytes: - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -.Lbwd_write_78bytes: - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -.Lbwd_write_62bytes: - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -.Lbwd_write_46bytes: - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -.Lbwd_write_30bytes: - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_14bytes: - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_141bytes: - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -.Lbwd_write_125bytes: - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -.Lbwd_write_109bytes: - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -.Lbwd_write_93bytes: - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -.Lbwd_write_77bytes: - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -.Lbwd_write_61bytes: - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -.Lbwd_write_45bytes: - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -.Lbwd_write_29bytes: - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_13bytes: - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_140bytes: - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -.Lbwd_write_124bytes: - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -.Lbwd_write_108bytes: - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -.Lbwd_write_92bytes: - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -.Lbwd_write_76bytes: - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) -.Lbwd_write_60bytes: - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -.Lbwd_write_44bytes: - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -.Lbwd_write_28bytes: - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_12bytes: - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_139bytes: - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 123(%rdi) -.Lbwd_write_123bytes: - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -.Lbwd_write_107bytes: - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -.Lbwd_write_91bytes: - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -.Lbwd_write_75bytes: - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -.Lbwd_write_59bytes: - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -.Lbwd_write_43bytes: - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -.Lbwd_write_27bytes: - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_11bytes: - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_138bytes: - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -.Lbwd_write_122bytes: - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -.Lbwd_write_106bytes: - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -.Lbwd_write_90bytes: - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -.Lbwd_write_74bytes: - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -.Lbwd_write_58bytes: - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -.Lbwd_write_42bytes: - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -.Lbwd_write_26bytes: - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_10bytes: - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_137bytes: - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -.Lbwd_write_121bytes: - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -.Lbwd_write_105bytes: - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -.Lbwd_write_89bytes: - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -.Lbwd_write_73bytes: - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -.Lbwd_write_57bytes: - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -.Lbwd_write_41bytes: - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -.Lbwd_write_25bytes: - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_9bytes: - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_136bytes: - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -.Lbwd_write_120bytes: - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -.Lbwd_write_104bytes: - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -.Lbwd_write_88bytes: - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -.Lbwd_write_72bytes: - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -.Lbwd_write_56bytes: - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -.Lbwd_write_40bytes: - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -.Lbwd_write_24bytes: - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_8bytes: - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_135bytes: - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -.Lbwd_write_119bytes: - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -.Lbwd_write_103bytes: - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -.Lbwd_write_87bytes: - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -.Lbwd_write_71bytes: - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -.Lbwd_write_55bytes: - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) -.Lbwd_write_39bytes: - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -.Lbwd_write_23bytes: - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_7bytes: - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_134bytes: - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -.Lbwd_write_118bytes: - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -.Lbwd_write_102bytes: - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -.Lbwd_write_86bytes: - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -.Lbwd_write_70bytes: - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -.Lbwd_write_54bytes: - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -.Lbwd_write_38bytes: - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -.Lbwd_write_22bytes: - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_6bytes: - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_133bytes: - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -.Lbwd_write_117bytes: - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -.Lbwd_write_101bytes: - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -.Lbwd_write_85bytes: - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -.Lbwd_write_69bytes: - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -.Lbwd_write_53bytes: - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -.Lbwd_write_37bytes: - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -.Lbwd_write_21bytes: - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_5bytes: - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_132bytes: - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -.Lbwd_write_116bytes: - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -.Lbwd_write_100bytes: - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -.Lbwd_write_84bytes: - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -.Lbwd_write_68bytes: - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -.Lbwd_write_52bytes: - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -.Lbwd_write_36bytes: - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -.Lbwd_write_20bytes: - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_4bytes: - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_131bytes: - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -.Lbwd_write_115bytes: - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -.Lbwd_write_99bytes: - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -.Lbwd_write_83bytes: - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -.Lbwd_write_67bytes: - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -.Lbwd_write_51bytes: - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -.Lbwd_write_35bytes: - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -.Lbwd_write_19bytes: - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_3bytes: - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_130bytes: - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -.Lbwd_write_114bytes: - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -.Lbwd_write_98bytes: - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -.Lbwd_write_82bytes: - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -.Lbwd_write_66bytes: - lddqu 50(%rsi), %xmm0 - movdqu %xmm0, 50(%rdi) -.Lbwd_write_50bytes: - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -.Lbwd_write_34bytes: - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -.Lbwd_write_18bytes: - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_2bytes: - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_129bytes: - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -.Lbwd_write_113bytes: - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -.Lbwd_write_97bytes: - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -.Lbwd_write_81bytes: - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -.Lbwd_write_65bytes: - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -.Lbwd_write_49bytes: - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -.Lbwd_write_33bytes: - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -.Lbwd_write_17bytes: - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_1bytes: - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - - .section .rodata.ssse3 - .p2align 3 -.Ltable_144_bytes_bwd: - .int .Lbwd_write_0bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_1bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_2bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_3bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_4bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_5bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_6bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_7bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_8bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_9bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_10bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_11bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_12bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_13bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_14bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_15bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_16bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_17bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_18bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_19bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_20bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_21bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_22bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_23bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_24bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_25bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_26bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_27bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_28bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_29bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_30bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_31bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_32bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_33bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_34bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_35bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_36bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_37bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_38bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_39bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_40bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_41bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_42bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_43bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_44bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_45bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_46bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_47bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_48bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_49bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_50bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_51bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_52bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_53bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_54bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_55bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_56bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_57bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_58bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_59bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_60bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_61bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_62bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_63bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_64bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_65bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_66bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_67bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_68bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_69bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_70bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_71bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_72bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_73bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_74bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_75bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_76bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_77bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_78bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_79bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_80bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_81bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_82bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_83bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_84bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_85bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_86bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_87bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_88bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_89bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_90bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_91bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_92bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_93bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_94bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_95bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_96bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_97bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_98bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_99bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_100bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_101bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_102bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_103bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_104bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_105bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_106bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_107bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_108bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_109bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_110bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_111bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_112bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_113bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_114bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_115bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_116bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_117bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_118bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_119bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_120bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_121bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_122bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_123bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_124bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_125bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_126bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_127bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_128bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_129bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_130bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_131bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_132bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_133bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_134bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_135bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_136bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_137bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_138bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_139bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_140bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_141bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_142bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_143bytes - .Ltable_144_bytes_bwd - - .p2align 3 -.Ltable_144_bytes_fwd: - .int .Lfwd_write_0bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_1bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_2bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_3bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_4bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_5bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_6bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_7bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_8bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_9bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_10bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_11bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_12bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_13bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_14bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_15bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_16bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_17bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_18bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_19bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_20bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_21bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_22bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_23bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_24bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_25bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_26bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_27bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_28bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_29bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_30bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_31bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_32bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_33bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_34bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_35bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_36bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_37bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_38bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_39bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_40bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_41bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_42bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_43bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_44bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_45bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_46bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_47bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_48bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_49bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_50bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_51bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_52bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_53bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_54bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_55bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_56bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_57bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_58bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_59bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_60bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_61bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_62bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_63bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_64bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_65bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_66bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_67bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_68bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_69bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_70bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_71bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_72bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_73bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_74bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_75bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_76bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_77bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_78bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_79bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_80bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_81bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_82bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_83bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_84bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_85bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_86bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_87bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_88bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_89bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_90bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_91bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_92bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_93bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_94bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_95bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_96bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_97bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_98bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_99bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_100bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_101bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_102bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_103bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_104bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_105bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_106bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_107bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_108bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_109bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_110bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_111bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_112bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_113bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_114bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_115bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_116bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_117bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_118bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_119bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_120bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_121bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_122bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_123bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_124bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_125bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_126bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_127bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_128bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_129bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_130bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_131bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_132bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_133bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_134bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_135bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_136bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_137bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_138bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_139bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_140bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_141bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_142bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_143bytes - .Ltable_144_bytes_fwd - - .p2align 3 -.Lshl_table_fwd: - .int .Lshl_0 - .Lshl_table_fwd - .int .Lshl_1 - .Lshl_table_fwd - .int .Lshl_2 - .Lshl_table_fwd - .int .Lshl_3 - .Lshl_table_fwd - .int .Lshl_4 - .Lshl_table_fwd - .int .Lshl_5 - .Lshl_table_fwd - .int .Lshl_6 - .Lshl_table_fwd - .int .Lshl_7 - .Lshl_table_fwd - .int .Lshl_8 - .Lshl_table_fwd - .int .Lshl_9 - .Lshl_table_fwd - .int .Lshl_10 - .Lshl_table_fwd - .int .Lshl_11 - .Lshl_table_fwd - .int .Lshl_12 - .Lshl_table_fwd - .int .Lshl_13 - .Lshl_table_fwd - .int .Lshl_14 - .Lshl_table_fwd - .int .Lshl_15 - .Lshl_table_fwd - - .p2align 3 -.Lshl_table_bwd: - .int .Lshl_0_bwd - .Lshl_table_bwd - .int .Lshl_1_bwd - .Lshl_table_bwd - .int .Lshl_2_bwd - .Lshl_table_bwd - .int .Lshl_3_bwd - .Lshl_table_bwd - .int .Lshl_4_bwd - .Lshl_table_bwd - .int .Lshl_5_bwd - .Lshl_table_bwd - .int .Lshl_6_bwd - .Lshl_table_bwd - .int .Lshl_7_bwd - .Lshl_table_bwd - .int .Lshl_8_bwd - .Lshl_table_bwd - .int .Lshl_9_bwd - .Lshl_table_bwd - .int .Lshl_10_bwd - .Lshl_table_bwd - .int .Lshl_11_bwd - .Lshl_table_bwd - .int .Lshl_12_bwd - .Lshl_table_bwd - .int .Lshl_13_bwd - .Lshl_table_bwd - .int .Lshl_14_bwd - .Lshl_table_bwd - .int .Lshl_15_bwd - .Lshl_table_bwd diff --git a/impls/memcpy-ssse3.s b/impls/memcpy-ssse3.s deleted file mode 100644 index e83ec95..0000000 --- a/impls/memcpy-ssse3.s +++ /dev/null @@ -1,2944 +0,0 @@ - .section .text.ssse3 - -.globl __mempcpy_ssse3 -__mempcpy_ssse3: - mov %rdi, %rax - add %rdx, %rax - jmp .Lstart - -.globl __memcpy_ssse3 -__memcpy_ssse3: - mov %rdi, %rax -.Lstart: - cmp $79, %rdx - lea .Ltable_less_80bytes(%rip), %r11 - ja .L80bytesormore - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - jmp *%r9 - ud2 - - .p2align 4 -.L80bytesormore: - - cmp %dil, %sil - jle .Lcopy_backward - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - - mov __x86_shared_cache_size_half(%rip), %rcx - - cmp %rcx, %rdx - mov %rsi, %r9 - ja .Llarge_page_fwd - and $0xf, %r9 - jz .Lshl_0 - - mov __x86_data_cache_size_half(%rip), %rcx - - lea .Lshl_table(%rip), %r11; movslq (%r11, %r9, 4), %r9; lea (%r11, %r9), %r9; jmp *%r9; ud2 - - .p2align 4 -.Lcopy_backward: - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - - mov __x86_shared_cache_size_half(%rip), %rcx - - cmp %rcx, %rdx - mov %rsi, %r9 - ja .Llarge_page_bwd - and $0xf, %r9 - jz .Lshl_0_bwd - - mov __x86_data_cache_size_half(%rip), %rcx - - lea .Lshl_table_bwd(%rip), %r11; movslq (%r11, %r9, 4), %r9; lea (%r11, %r9), %r9; jmp *%r9; ud2 - - .p2align 4 -.Lshl_0: - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja .Lshl_0_gobble - cmp $64, %rdx - jb .Lshl_0_less_64bytes - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -.Lshl_0_less_64bytes: - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble: - - cmp __x86_data_cache_size_half(%rip), %rdx - - lea -128(%rdx), %rdx - jae .Lshl_0_gobble_mem_loop -.Lshl_0_gobble_cache_loop: - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae .Lshl_0_gobble_cache_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_cache_less_64bytes - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -.Lshl_0_cache_less_64bytes: - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble_mem_loop: - prefetcht0 0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae .Lshl_0_gobble_mem_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_mem_less_64bytes - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -.Lshl_0_mem_less_64bytes: - cmp $0x20, %rdx - jb .Lshl_0_mem_less_32bytes - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -.Lshl_0_mem_less_32bytes: - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_bwd: - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja .Lshl_0_gobble_bwd - cmp $64, %rdx - jb .Lshl_0_less_64bytes_bwd - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -.Lshl_0_less_64bytes_bwd: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble_bwd: - - cmp __x86_data_cache_size_half(%rip), %rdx - - lea -128(%rdx), %rdx - jae .Lshl_0_gobble_mem_bwd_loop -.Lshl_0_gobble_bwd_loop: - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae .Lshl_0_gobble_bwd_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_gobble_bwd_less_64bytes - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -.Lshl_0_gobble_bwd_less_64bytes: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble_mem_bwd_loop: - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa %xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae .Lshl_0_gobble_mem_bwd_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_mem_bwd_less_64bytes - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -.Lshl_0_mem_bwd_less_64bytes: - cmp $0x20, %rdx - jb .Lshl_0_mem_bwd_less_32bytes - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -.Lshl_0_mem_bwd_less_32bytes: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1: - lea (.Lshl_1_loop_L1-.Lshl_1)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb .LL1_fwd - lea (.Lshl_1_loop_L2-.Lshl_1_loop_L1)(%r9), %r9 -.LL1_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_1_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_1_loop_L1: - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_1_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_1_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1_bwd: - lea (.Lshl_1_bwd_loop_L1-.Lshl_1_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb .LL1_bwd - lea (.Lshl_1_bwd_loop_L2-.Lshl_1_bwd_loop_L1)(%r9), %r9 -.LL1_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_1_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_1_bwd_loop_L1: - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 - movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_1_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_1_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2: - lea (.Lshl_2_loop_L1-.Lshl_2)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb .LL2_fwd - lea (.Lshl_2_loop_L2-.Lshl_2_loop_L1)(%r9), %r9 -.LL2_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_2_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_2_loop_L1: - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_2_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_2_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2_bwd: - lea (.Lshl_2_bwd_loop_L1-.Lshl_2_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb .LL2_bwd - lea (.Lshl_2_bwd_loop_L2-.Lshl_2_bwd_loop_L1)(%r9), %r9 -.LL2_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_2_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_2_bwd_loop_L1: - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_2_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_2_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3: - lea (.Lshl_3_loop_L1-.Lshl_3)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb .LL3_fwd - lea (.Lshl_3_loop_L2-.Lshl_3_loop_L1)(%r9), %r9 -.LL3_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_3_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_3_loop_L1: - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_3_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_3_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3_bwd: - lea (.Lshl_3_bwd_loop_L1-.Lshl_3_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb .LL3_bwd - lea (.Lshl_3_bwd_loop_L2-.Lshl_3_bwd_loop_L1)(%r9), %r9 -.LL3_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_3_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_3_bwd_loop_L1: - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_3_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_3_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4: - lea (.Lshl_4_loop_L1-.Lshl_4)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb .LL4_fwd - lea (.Lshl_4_loop_L2-.Lshl_4_loop_L1)(%r9), %r9 -.LL4_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_4_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_4_loop_L1: - sub $64, %rdx - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_4_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_4_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4_bwd: - lea (.Lshl_4_bwd_loop_L1-.Lshl_4_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb .LL4_bwd - lea (.Lshl_4_bwd_loop_L2-.Lshl_4_bwd_loop_L1)(%r9), %r9 -.LL4_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_4_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_4_bwd_loop_L1: - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_4_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_4_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5: - lea (.Lshl_5_loop_L1-.Lshl_5)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb .LL5_fwd - lea (.Lshl_5_loop_L2-.Lshl_5_loop_L1)(%r9), %r9 -.LL5_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_5_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_5_loop_L1: - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_5_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_5_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5_bwd: - lea (.Lshl_5_bwd_loop_L1-.Lshl_5_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb .LL5_bwd - lea (.Lshl_5_bwd_loop_L2-.Lshl_5_bwd_loop_L1)(%r9), %r9 -.LL5_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_5_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_5_bwd_loop_L1: - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_5_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_5_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6: - lea (.Lshl_6_loop_L1-.Lshl_6)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb .LL6_fwd - lea (.Lshl_6_loop_L2-.Lshl_6_loop_L1)(%r9), %r9 -.LL6_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_6_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_6_loop_L1: - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_6_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_6_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6_bwd: - lea (.Lshl_6_bwd_loop_L1-.Lshl_6_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb .LL6_bwd - lea (.Lshl_6_bwd_loop_L2-.Lshl_6_bwd_loop_L1)(%r9), %r9 -.LL6_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_6_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_6_bwd_loop_L1: - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_6_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_6_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7: - lea (.Lshl_7_loop_L1-.Lshl_7)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb .LL7_fwd - lea (.Lshl_7_loop_L2-.Lshl_7_loop_L1)(%r9), %r9 -.LL7_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_7_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_7_loop_L1: - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_7_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_7_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7_bwd: - lea (.Lshl_7_bwd_loop_L1-.Lshl_7_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb .LL7_bwd - lea (.Lshl_7_bwd_loop_L2-.Lshl_7_bwd_loop_L1)(%r9), %r9 -.LL7_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_7_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_7_bwd_loop_L1: - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_7_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_7_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8: - lea (.Lshl_8_loop_L1-.Lshl_8)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb .LL8_fwd - lea (.Lshl_8_loop_L2-.Lshl_8_loop_L1)(%r9), %r9 -.LL8_fwd: - lea -64(%rdx), %rdx - jmp *%r9 -.Lshl_8_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_8_loop_L1: - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_8_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 - .p2align 4 -.Lshl_8_end: - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8_bwd: - lea (.Lshl_8_bwd_loop_L1-.Lshl_8_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb .LL8_bwd - lea (.Lshl_8_bwd_loop_L2-.Lshl_8_bwd_loop_L1)(%r9), %r9 -.LL8_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_8_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_8_bwd_loop_L1: - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_8_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_8_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9: - lea (.Lshl_9_loop_L1-.Lshl_9)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb .LL9_fwd - lea (.Lshl_9_loop_L2-.Lshl_9_loop_L1)(%r9), %r9 -.LL9_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_9_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_9_loop_L1: - sub $64, %rdx - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_9_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_9_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9_bwd: - lea (.Lshl_9_bwd_loop_L1-.Lshl_9_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb .LL9_bwd - lea (.Lshl_9_bwd_loop_L2-.Lshl_9_bwd_loop_L1)(%r9), %r9 -.LL9_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_9_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_9_bwd_loop_L1: - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_9_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_9_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10: - lea (.Lshl_10_loop_L1-.Lshl_10)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb .LL10_fwd - lea (.Lshl_10_loop_L2-.Lshl_10_loop_L1)(%r9), %r9 -.LL10_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_10_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_10_loop_L1: - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_10_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_10_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10_bwd: - lea (.Lshl_10_bwd_loop_L1-.Lshl_10_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb .LL10_bwd - lea (.Lshl_10_bwd_loop_L2-.Lshl_10_bwd_loop_L1)(%r9), %r9 -.LL10_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_10_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_10_bwd_loop_L1: - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_10_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_10_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11: - lea (.Lshl_11_loop_L1-.Lshl_11)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb .LL11_fwd - lea (.Lshl_11_loop_L2-.Lshl_11_loop_L1)(%r9), %r9 -.LL11_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_11_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_11_loop_L1: - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_11_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_11_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11_bwd: - lea (.Lshl_11_bwd_loop_L1-.Lshl_11_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb .LL11_bwd - lea (.Lshl_11_bwd_loop_L2-.Lshl_11_bwd_loop_L1)(%r9), %r9 -.LL11_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_11_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_11_bwd_loop_L1: - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_11_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_11_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12: - lea (.Lshl_12_loop_L1-.Lshl_12)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb .LL12_fwd - lea (.Lshl_12_loop_L2-.Lshl_12_loop_L1)(%r9), %r9 -.LL12_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_12_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_12_loop_L1: - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_12_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_12_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12_bwd: - lea (.Lshl_12_bwd_loop_L1-.Lshl_12_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb .LL12_bwd - lea (.Lshl_12_bwd_loop_L2-.Lshl_12_bwd_loop_L1)(%r9), %r9 -.LL12_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_12_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_12_bwd_loop_L1: - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_12_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_12_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13: - lea (.Lshl_13_loop_L1-.Lshl_13)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb .LL13_fwd - lea (.Lshl_13_loop_L2-.Lshl_13_loop_L1)(%r9), %r9 -.LL13_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_13_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_13_loop_L1: - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_13_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_13_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13_bwd: - lea (.Lshl_13_bwd_loop_L1-.Lshl_13_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb .LL13_bwd - lea (.Lshl_13_bwd_loop_L2-.Lshl_13_bwd_loop_L1)(%r9), %r9 -.LL13_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_13_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_13_bwd_loop_L1: - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_13_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_13_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14: - lea (.Lshl_14_loop_L1-.Lshl_14)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb .LL14_fwd - lea (.Lshl_14_loop_L2-.Lshl_14_loop_L1)(%r9), %r9 -.LL14_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_14_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_14_loop_L1: - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_14_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_14_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14_bwd: - lea (.Lshl_14_bwd_loop_L1-.Lshl_14_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb .LL14_bwd - lea (.Lshl_14_bwd_loop_L2-.Lshl_14_bwd_loop_L1)(%r9), %r9 -.LL14_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_14_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_14_bwd_loop_L1: - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_14_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_14_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15: - lea (.Lshl_15_loop_L1-.Lshl_15)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb .LL15_fwd - lea (.Lshl_15_loop_L2-.Lshl_15_loop_L1)(%r9), %r9 -.LL15_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_15_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_15_loop_L1: - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_15_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_15_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15_bwd: - lea (.Lshl_15_bwd_loop_L1-.Lshl_15_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb .LL15_bwd - lea (.Lshl_15_bwd_loop_L2-.Lshl_15_bwd_loop_L1)(%r9), %r9 -.LL15_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_15_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_15_bwd_loop_L1: - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_15_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_15_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lwrite_72bytes: - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_64bytes: - movdqu -64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_56bytes: - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_48bytes: - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_40bytes: - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_32bytes: - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_24bytes: - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_16bytes: - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_8bytes: - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -.Lwrite_0bytes: - ret - - .p2align 4 -.Lwrite_73bytes: - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_65bytes: - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_57bytes: - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_49bytes: - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_41bytes: - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_33bytes: - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_25bytes: - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_17bytes: - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_9bytes: - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_1bytes: - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_74bytes: - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_66bytes: - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_58bytes: - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_50bytes: - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_42bytes: - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_34bytes: - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_26bytes: - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_18bytes: - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_10bytes: - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_2bytes: - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -.Lwrite_75bytes: - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_67bytes: - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_59bytes: - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_51bytes: - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_43bytes: - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_35bytes: - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_27bytes: - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_19bytes: - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_11bytes: - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_3bytes: - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -.Lwrite_76bytes: - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_68bytes: - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_60bytes: - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_52bytes: - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_44bytes: - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_36bytes: - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_28bytes: - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_20bytes: - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_12bytes: - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_4bytes: - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_77bytes: - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_69bytes: - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_61bytes: - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_53bytes: - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_45bytes: - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_37bytes: - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_29bytes: - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_21bytes: - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_13bytes: - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_5bytes: - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_78bytes: - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_70bytes: - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_62bytes: - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_54bytes: - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_46bytes: - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_38bytes: - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_30bytes: - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_22bytes: - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_14bytes: - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_6bytes: - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_79bytes: - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_71bytes: - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_63bytes: - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_55bytes: - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_47bytes: - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_39bytes: - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_31bytes: - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_23bytes: - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_15bytes: - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_7bytes: - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Llarge_page_fwd: - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx -.Llarge_page_loop: - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae .Llarge_page_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Llarge_page_less_64bytes - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -.Llarge_page_less_64bytes: - add %rdx, %rsi - add %rdx, %rdi - sfence - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - .p2align 4 -.Llarge_page_bwd: - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx -.Llarge_page_bwd_loop: - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae .Llarge_page_bwd_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Llarge_page_less_bwd_64bytes - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -.Llarge_page_less_bwd_64bytes: - sfence - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .section .rodata.ssse3 - .p2align 3 -.Ltable_less_80bytes: - .int .Lwrite_0bytes - .Ltable_less_80bytes - .int .Lwrite_1bytes - .Ltable_less_80bytes - .int .Lwrite_2bytes - .Ltable_less_80bytes - .int .Lwrite_3bytes - .Ltable_less_80bytes - .int .Lwrite_4bytes - .Ltable_less_80bytes - .int .Lwrite_5bytes - .Ltable_less_80bytes - .int .Lwrite_6bytes - .Ltable_less_80bytes - .int .Lwrite_7bytes - .Ltable_less_80bytes - .int .Lwrite_8bytes - .Ltable_less_80bytes - .int .Lwrite_9bytes - .Ltable_less_80bytes - .int .Lwrite_10bytes - .Ltable_less_80bytes - .int .Lwrite_11bytes - .Ltable_less_80bytes - .int .Lwrite_12bytes - .Ltable_less_80bytes - .int .Lwrite_13bytes - .Ltable_less_80bytes - .int .Lwrite_14bytes - .Ltable_less_80bytes - .int .Lwrite_15bytes - .Ltable_less_80bytes - .int .Lwrite_16bytes - .Ltable_less_80bytes - .int .Lwrite_17bytes - .Ltable_less_80bytes - .int .Lwrite_18bytes - .Ltable_less_80bytes - .int .Lwrite_19bytes - .Ltable_less_80bytes - .int .Lwrite_20bytes - .Ltable_less_80bytes - .int .Lwrite_21bytes - .Ltable_less_80bytes - .int .Lwrite_22bytes - .Ltable_less_80bytes - .int .Lwrite_23bytes - .Ltable_less_80bytes - .int .Lwrite_24bytes - .Ltable_less_80bytes - .int .Lwrite_25bytes - .Ltable_less_80bytes - .int .Lwrite_26bytes - .Ltable_less_80bytes - .int .Lwrite_27bytes - .Ltable_less_80bytes - .int .Lwrite_28bytes - .Ltable_less_80bytes - .int .Lwrite_29bytes - .Ltable_less_80bytes - .int .Lwrite_30bytes - .Ltable_less_80bytes - .int .Lwrite_31bytes - .Ltable_less_80bytes - .int .Lwrite_32bytes - .Ltable_less_80bytes - .int .Lwrite_33bytes - .Ltable_less_80bytes - .int .Lwrite_34bytes - .Ltable_less_80bytes - .int .Lwrite_35bytes - .Ltable_less_80bytes - .int .Lwrite_36bytes - .Ltable_less_80bytes - .int .Lwrite_37bytes - .Ltable_less_80bytes - .int .Lwrite_38bytes - .Ltable_less_80bytes - .int .Lwrite_39bytes - .Ltable_less_80bytes - .int .Lwrite_40bytes - .Ltable_less_80bytes - .int .Lwrite_41bytes - .Ltable_less_80bytes - .int .Lwrite_42bytes - .Ltable_less_80bytes - .int .Lwrite_43bytes - .Ltable_less_80bytes - .int .Lwrite_44bytes - .Ltable_less_80bytes - .int .Lwrite_45bytes - .Ltable_less_80bytes - .int .Lwrite_46bytes - .Ltable_less_80bytes - .int .Lwrite_47bytes - .Ltable_less_80bytes - .int .Lwrite_48bytes - .Ltable_less_80bytes - .int .Lwrite_49bytes - .Ltable_less_80bytes - .int .Lwrite_50bytes - .Ltable_less_80bytes - .int .Lwrite_51bytes - .Ltable_less_80bytes - .int .Lwrite_52bytes - .Ltable_less_80bytes - .int .Lwrite_53bytes - .Ltable_less_80bytes - .int .Lwrite_54bytes - .Ltable_less_80bytes - .int .Lwrite_55bytes - .Ltable_less_80bytes - .int .Lwrite_56bytes - .Ltable_less_80bytes - .int .Lwrite_57bytes - .Ltable_less_80bytes - .int .Lwrite_58bytes - .Ltable_less_80bytes - .int .Lwrite_59bytes - .Ltable_less_80bytes - .int .Lwrite_60bytes - .Ltable_less_80bytes - .int .Lwrite_61bytes - .Ltable_less_80bytes - .int .Lwrite_62bytes - .Ltable_less_80bytes - .int .Lwrite_63bytes - .Ltable_less_80bytes - .int .Lwrite_64bytes - .Ltable_less_80bytes - .int .Lwrite_65bytes - .Ltable_less_80bytes - .int .Lwrite_66bytes - .Ltable_less_80bytes - .int .Lwrite_67bytes - .Ltable_less_80bytes - .int .Lwrite_68bytes - .Ltable_less_80bytes - .int .Lwrite_69bytes - .Ltable_less_80bytes - .int .Lwrite_70bytes - .Ltable_less_80bytes - .int .Lwrite_71bytes - .Ltable_less_80bytes - .int .Lwrite_72bytes - .Ltable_less_80bytes - .int .Lwrite_73bytes - .Ltable_less_80bytes - .int .Lwrite_74bytes - .Ltable_less_80bytes - .int .Lwrite_75bytes - .Ltable_less_80bytes - .int .Lwrite_76bytes - .Ltable_less_80bytes - .int .Lwrite_77bytes - .Ltable_less_80bytes - .int .Lwrite_78bytes - .Ltable_less_80bytes - .int .Lwrite_79bytes - .Ltable_less_80bytes - - .p2align 3 -.Lshl_table: - .int .Lshl_0 - .Lshl_table - .int .Lshl_1 - .Lshl_table - .int .Lshl_2 - .Lshl_table - .int .Lshl_3 - .Lshl_table - .int .Lshl_4 - .Lshl_table - .int .Lshl_5 - .Lshl_table - .int .Lshl_6 - .Lshl_table - .int .Lshl_7 - .Lshl_table - .int .Lshl_8 - .Lshl_table - .int .Lshl_9 - .Lshl_table - .int .Lshl_10 - .Lshl_table - .int .Lshl_11 - .Lshl_table - .int .Lshl_12 - .Lshl_table - .int .Lshl_13 - .Lshl_table - .int .Lshl_14 - .Lshl_table - .int .Lshl_15 - .Lshl_table - - .p2align 3 -.Lshl_table_bwd: - .int .Lshl_0_bwd - .Lshl_table_bwd - .int .Lshl_1_bwd - .Lshl_table_bwd - .int .Lshl_2_bwd - .Lshl_table_bwd - .int .Lshl_3_bwd - .Lshl_table_bwd - .int .Lshl_4_bwd - .Lshl_table_bwd - .int .Lshl_5_bwd - .Lshl_table_bwd - .int .Lshl_6_bwd - .Lshl_table_bwd - .int .Lshl_7_bwd - .Lshl_table_bwd - .int .Lshl_8_bwd - .Lshl_table_bwd - .int .Lshl_9_bwd - .Lshl_table_bwd - .int .Lshl_10_bwd - .Lshl_table_bwd - .int .Lshl_11_bwd - .Lshl_table_bwd - .int .Lshl_12_bwd - .Lshl_table_bwd - .int .Lshl_13_bwd - .Lshl_table_bwd - .int .Lshl_14_bwd - .Lshl_table_bwd - .int .Lshl_15_bwd - .Lshl_table_bwd diff --git a/impls/memmove-avx-unaligned-erms-rtm.s b/impls/memmove-avx-unaligned-erms-rtm.s index 1d970ad..7c9eb27 100644 --- a/impls/memmove-avx-unaligned-erms-rtm.s +++ b/impls/memmove-avx-unaligned-erms-rtm.s @@ -22,7 +22,7 @@ __memmove_avx_unaligned_rtm: vmovdqu %ymm0, (%rdi) vmovdqu %ymm1, -32(%rdi,%rdx) - jmp .Lreturn + jmp .Lreturn_vzeroupper .globl __mempcpy_avx_unaligned_erms_rtm __mempcpy_avx_unaligned_erms_rtm: @@ -45,9 +45,9 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu -32(%rsi, %rdx), %ymm1 vmovdqu %ymm0, (%rdi) vmovdqu %ymm1, -32(%rdi, %rdx) -.Lreturn: +.Lreturn_vzeroupper: - xtest; jz 1f; vzeroall; ret; 1: vzeroupper; ret + xtest; jnz 1f; vzeroupper; ret; 1: vzeroall; ret .p2align 4 .Lless_vec: cmpl $16, %edx @@ -105,7 +105,7 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm1, 32(%rdi) vmovdqu %ymm2, -32(%rdi, %rdx) vmovdqu %ymm3, -(32 * 2)(%rdi, %rdx) - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 4 @@ -136,7 +136,7 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm5, -(32 * 2)(%rdi, %rdx) vmovdqu %ymm6, -(32 * 3)(%rdi, %rdx) vmovdqu %ymm7, -(32 * 4)(%rdi, %rdx) - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 4,, 4 .Lmore_8x_vec: @@ -205,7 +205,7 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm0, (%rcx) .Lnop_backward: - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 4,, 8 .Lmore_8x_vec_backward_check_nop: @@ -249,7 +249,7 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm7, (32 * 3)(%rdi) vmovdqu %ymm8, -32(%rdx, %rdi) - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 5,, 16 @@ -277,7 +277,7 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm1, 32(%r8) - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 4,, 12 .Lmovsb: @@ -293,7 +293,8 @@ __memmove_avx_unaligned_erms_rtm: cmp __x86_rep_movsb_stop_threshold(%rip), %rdx jae .Llarge_memcpy_2x_check - testl $(1 << 0), __x86_string_control(%rip) + testb $(1 << 0), __x86_string_control(%rip) + jz .Lskip_short_movsb_check cmpl $-64, %ecx ja .Lmore_8x_vec_forward @@ -319,13 +320,15 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm1, 32(%r8) - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 4,, 10 .Llarge_memcpy_2x_check: - cmp __x86_rep_movsb_threshold(%rip), %rdx - jb .Lmore_8x_vec_check + .Llarge_memcpy_2x: + mov __x86_shared_non_temporal_threshold(%rip), %r11 + cmp %r11, %rdx + jb .Lmore_8x_vec_check negq %rcx cmpq %rcx, %rdx @@ -349,17 +352,17 @@ __memmove_avx_unaligned_erms_rtm: addq %r8, %rdx notl %ecx + movq %rdx, %r10 testl $(4096 - 32 * 8), %ecx jz .Llarge_memcpy_4x - movq %rdx, %r10 - shrq $4, %r10 - cmp __x86_shared_non_temporal_threshold(%rip), %r10 + shlq $4, %r11 + cmp %r11, %rdx jae .Llarge_memcpy_4x andl $(4096 * 2 - 1), %edx - shrq $((12 + 1) - 4), %r10 + shrq $(12 + 1), %r10 .p2align 4 .Lloop_large_memcpy_2x_outer: @@ -418,11 +421,10 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm1, -(32 * 3)(%rdi, %rdx) vmovdqu %ymm2, -(32 * 2)(%rdi, %rdx) vmovdqu %ymm3, -32(%rdi, %rdx) - jmp .Lreturn + jmp .Lreturn_vzeroupper .p2align 4 .Llarge_memcpy_4x: - movq %rdx, %r10 andl $(4096 * 4 - 1), %edx @@ -490,7 +492,7 @@ __memmove_avx_unaligned_erms_rtm: vmovdqu %ymm1, -(32 * 3)(%rdi, %rdx) vmovdqu %ymm2, -(32 * 2)(%rdi, %rdx) vmovdqu %ymm3, -32(%rdi, %rdx) - jmp .Lreturn + jmp .Lreturn_vzeroupper .globl __memcpy_avx_unaligned_erms_rtm .set __memcpy_avx_unaligned_erms_rtm, __memmove_avx_unaligned_erms_rtm diff --git a/impls/memmove-avx-unaligned-erms.s b/impls/memmove-avx-unaligned-erms.s index 4574370..40a2ee3 100644 --- a/impls/memmove-avx-unaligned-erms.s +++ b/impls/memmove-avx-unaligned-erms.s @@ -45,7 +45,7 @@ __memmove_avx_unaligned_erms: vmovdqu -32(%rsi, %rdx), %ymm1 vmovdqu %ymm0, (%rdi) vmovdqu %ymm1, -32(%rdi, %rdx) -.Lreturn: +.Lreturn_vzeroupper: vzeroupper; ret .p2align 4 @@ -293,7 +293,8 @@ __memmove_avx_unaligned_erms: cmp __x86_rep_movsb_stop_threshold(%rip), %rdx jae .Llarge_memcpy_2x_check - testl $(1 << 0), __x86_string_control(%rip) + testb $(1 << 0), __x86_string_control(%rip) + jz .Lskip_short_movsb_check cmpl $-64, %ecx ja .Lmore_8x_vec_forward @@ -323,9 +324,11 @@ __memmove_avx_unaligned_erms: .p2align 4,, 10 .Llarge_memcpy_2x_check: - cmp __x86_rep_movsb_threshold(%rip), %rdx - jb .Lmore_8x_vec_check + .Llarge_memcpy_2x: + mov __x86_shared_non_temporal_threshold(%rip), %r11 + cmp %r11, %rdx + jb .Lmore_8x_vec_check negq %rcx cmpq %rcx, %rdx @@ -349,17 +352,17 @@ __memmove_avx_unaligned_erms: addq %r8, %rdx notl %ecx + movq %rdx, %r10 testl $(4096 - 32 * 8), %ecx jz .Llarge_memcpy_4x - movq %rdx, %r10 - shrq $4, %r10 - cmp __x86_shared_non_temporal_threshold(%rip), %r10 + shlq $4, %r11 + cmp %r11, %rdx jae .Llarge_memcpy_4x andl $(4096 * 2 - 1), %edx - shrq $((12 + 1) - 4), %r10 + shrq $(12 + 1), %r10 .p2align 4 .Lloop_large_memcpy_2x_outer: @@ -422,7 +425,6 @@ __memmove_avx_unaligned_erms: .p2align 4 .Llarge_memcpy_4x: - movq %rdx, %r10 andl $(4096 * 4 - 1), %edx diff --git a/impls/memmove-avx512-unaligned-erms.s b/impls/memmove-avx512-unaligned-erms.s index 4316724..2c8d68f 100644 --- a/impls/memmove-avx512-unaligned-erms.s +++ b/impls/memmove-avx512-unaligned-erms.s @@ -45,7 +45,7 @@ __memmove_avx512_unaligned_erms: vmovdqu64 -64(%rsi, %rdx), %zmm17 vmovdqu64 %zmm16, (%rdi) vmovdqu64 %zmm17, -64(%rdi, %rdx) -.Lreturn: +.Lreturn_vzeroupper: ; ret .p2align 4,, 8 @@ -301,7 +301,8 @@ __memmove_avx512_unaligned_erms: cmp __x86_rep_movsb_stop_threshold(%rip), %rdx jae .Llarge_memcpy_2x_check - testl $(1 << 0), __x86_string_control(%rip) + testb $(1 << 0), __x86_string_control(%rip) + jz .Lskip_short_movsb_check cmpl $-64, %ecx ja .Lmore_8x_vec_forward @@ -326,9 +327,11 @@ __memmove_avx512_unaligned_erms: .p2align 4,, 10 .Llarge_memcpy_2x_check: - cmp __x86_rep_movsb_threshold(%rip), %rdx - jb .Lmore_8x_vec_check + .Llarge_memcpy_2x: + mov __x86_shared_non_temporal_threshold(%rip), %r11 + cmp %r11, %rdx + jb .Lmore_8x_vec_check negq %rcx cmpq %rcx, %rdx @@ -346,17 +349,17 @@ __memmove_avx512_unaligned_erms: addq %r8, %rdx notl %ecx + movq %rdx, %r10 testl $(4096 - 64 * 8), %ecx jz .Llarge_memcpy_4x - movq %rdx, %r10 - shrq $4, %r10 - cmp __x86_shared_non_temporal_threshold(%rip), %r10 + shlq $4, %r11 + cmp %r11, %rdx jae .Llarge_memcpy_4x andl $(4096 * 2 - 1), %edx - shrq $((12 + 1) - 4), %r10 + shrq $(12 + 1), %r10 .p2align 4 .Lloop_large_memcpy_2x_outer: @@ -419,7 +422,6 @@ __memmove_avx512_unaligned_erms: .p2align 4 .Llarge_memcpy_4x: - movq %rdx, %r10 andl $(4096 * 4 - 1), %edx diff --git a/impls/memmove-erms.s b/impls/memmove-erms.s new file mode 100644 index 0000000..99225c7 --- /dev/null +++ b/impls/memmove-erms.s @@ -0,0 +1,38 @@ + .text +.globl __mempcpy_erms +__mempcpy_erms: + mov %rdi, %rax + + test %rdx, %rdx + jz 2f + add %rdx, %rax + jmp .Lstart_movsb + +.globl __memmove_erms +__memmove_erms: + movq %rdi, %rax + + test %rdx, %rdx + jz 2f +.Lstart_movsb: + mov %rdx, %rcx + cmp %rsi, %rdi + jb 1f + + je 2f + lea (%rsi,%rcx), %rdx + cmp %rdx, %rdi + jb .Lmovsb_backward +1: + rep movsb +2: + ret +.Lmovsb_backward: + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +.globl __memcpy_erms +.set __memcpy_erms, __memmove_erms diff --git a/impls/memmove-evex-unaligned-erms.s b/impls/memmove-evex-unaligned-erms.s index 967df0b..65c4740 100644 --- a/impls/memmove-evex-unaligned-erms.s +++ b/impls/memmove-evex-unaligned-erms.s @@ -45,7 +45,7 @@ __memmove_evex_unaligned_erms: vmovdqu64 -32(%rsi, %rdx), %ymm17 vmovdqu64 %ymm16, (%rdi) vmovdqu64 %ymm17, -32(%rdi, %rdx) -.Lreturn: +.Lreturn_vzeroupper: ; ret .p2align 4,, 8 @@ -292,7 +292,8 @@ __memmove_evex_unaligned_erms: cmp __x86_rep_movsb_stop_threshold(%rip), %rdx jae .Llarge_memcpy_2x_check - testl $(1 << 0), __x86_string_control(%rip) + testb $(1 << 0), __x86_string_control(%rip) + jz .Lskip_short_movsb_check cmpl $-64, %ecx ja .Lmore_8x_vec_forward @@ -322,9 +323,11 @@ __memmove_evex_unaligned_erms: .p2align 4,, 10 .Llarge_memcpy_2x_check: - cmp __x86_rep_movsb_threshold(%rip), %rdx - jb .Lmore_8x_vec_check + .Llarge_memcpy_2x: + mov __x86_shared_non_temporal_threshold(%rip), %r11 + cmp %r11, %rdx + jb .Lmore_8x_vec_check negq %rcx cmpq %rcx, %rdx @@ -348,17 +351,17 @@ __memmove_evex_unaligned_erms: addq %r8, %rdx notl %ecx + movq %rdx, %r10 testl $(4096 - 32 * 8), %ecx jz .Llarge_memcpy_4x - movq %rdx, %r10 - shrq $4, %r10 - cmp __x86_shared_non_temporal_threshold(%rip), %r10 + shlq $4, %r11 + cmp %r11, %rdx jae .Llarge_memcpy_4x andl $(4096 * 2 - 1), %edx - shrq $((12 + 1) - 4), %r10 + shrq $(12 + 1), %r10 .p2align 4 .Lloop_large_memcpy_2x_outer: @@ -421,7 +424,6 @@ __memmove_evex_unaligned_erms: .p2align 4 .Llarge_memcpy_4x: - movq %rdx, %r10 andl $(4096 * 4 - 1), %edx diff --git a/impls/memmove-sse2-unaligned-erms.s b/impls/memmove-sse2-unaligned-erms.s index 191980c..c12c2ce 100644 --- a/impls/memmove-sse2-unaligned-erms.s +++ b/impls/memmove-sse2-unaligned-erms.s @@ -24,44 +24,6 @@ __memmove_sse2_unaligned: ; ret -.globl __mempcpy_erms -__mempcpy_erms: - mov %rdi, %rax - - test %rdx, %rdx - jz 2f - add %rdx, %rax - jmp .Lstart_movsb - -.globl __memmove_erms -__memmove_erms: - movq %rdi, %rax - - test %rdx, %rdx - jz 2f -.Lstart_movsb: - mov %rdx, %rcx - cmp %rsi, %rdi - jb 1f - - je 2f - lea (%rsi,%rcx), %rdx - cmp %rdx, %rdi - jb .Lmovsb_backward -1: - rep movsb -2: - ret -.Lmovsb_backward: - leaq -1(%rdi,%rcx), %rdi - leaq -1(%rsi,%rcx), %rsi - std - rep movsb - cld - ret -.globl __memcpy_erms -.set __memcpy_erms, __memmove_erms - .globl __mempcpy_sse2_unaligned_erms __mempcpy_sse2_unaligned_erms: mov %rdi, %rax @@ -83,7 +45,7 @@ __memmove_sse2_unaligned_erms: movups -16(%rsi, %rdx), %xmm1 movups %xmm0, (%rdi) movups %xmm1, -16(%rdi, %rdx) -.Lreturn: +.Lreturn_vzeroupper: ret .p2align 4 @@ -297,9 +259,11 @@ __memmove_sse2_unaligned_erms: .p2align 4,, 10 .Llarge_memcpy_2x_check: - cmp __x86_rep_movsb_threshold(%rip), %rdx - jb .Lmore_8x_vec_check + .Llarge_memcpy_2x: + mov __x86_shared_non_temporal_threshold(%rip), %r11 + cmp %r11, %rdx + jb .Lmore_8x_vec_check negq %rcx cmpq %rcx, %rdx @@ -329,17 +293,17 @@ __memmove_sse2_unaligned_erms: addq %r8, %rdx notl %ecx + movq %rdx, %r10 testl $(4096 - 16 * 8), %ecx jz .Llarge_memcpy_4x - movq %rdx, %r10 - shrq $4, %r10 - cmp __x86_shared_non_temporal_threshold(%rip), %r10 + shlq $4, %r11 + cmp %r11, %rdx jae .Llarge_memcpy_4x andl $(4096 * 2 - 1), %edx - shrq $((12 + 1) - 4), %r10 + shrq $(12 + 1), %r10 .p2align 4 .Lloop_large_memcpy_2x_outer: @@ -402,7 +366,6 @@ __memmove_sse2_unaligned_erms: .p2align 4 .Llarge_memcpy_4x: - movq %rdx, %r10 andl $(4096 * 4 - 1), %edx @@ -477,6 +440,3 @@ __memmove_sse2_unaligned_erms: .globl __memcpy_sse2_unaligned .set __memcpy_sse2_unaligned, __memmove_sse2_unaligned - -.globl __SI__memmove_sse2_unaligned_0 -__SI__memmove_sse2_unaligned_0: diff --git a/impls/memmove-ssse3-back.s b/impls/memmove-ssse3-back.s deleted file mode 100644 index 7bac499..0000000 --- a/impls/memmove-ssse3-back.s +++ /dev/null @@ -1,3076 +0,0 @@ - .section .text.ssse3 -.globl __memmove_ssse3_back -__memmove_ssse3_back: - mov %rdi, %rax - cmp %rsi, %rdi - jb .Lcopy_forward - je .Lbwd_write_0bytes - cmp $144, %rdx - jae .Lcopy_backward - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 -.Lcopy_forward: - -.Lstart: - cmp $144, %rdx - jae .L144bytesormore - -.Lfwd_write_less32bytes: - - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.L144bytesormore: - - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz .Lshl_0 - - mov __x86_data_cache_size(%rip), %rcx - - cmp %rcx, %rdx - jae .Lgobble_mem_fwd - lea .Lshl_table_fwd(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - jmp *%r9 - ud2 - - .p2align 4 -.Lcopy_backward: - - mov __x86_data_cache_size(%rip), %rcx - - shl $1, %rcx - cmp %rcx, %rdx - ja .Lgobble_mem_bwd - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz .Lshl_0_bwd - lea .Lshl_table_bwd(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - jmp *%r9 - ud2 - - .p2align 4 -.Lshl_0: - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 - - cmp __x86_data_cache_size_half(%rip), %r9 - - jae .Lgobble_mem_fwd - sub $0x80, %rdx - .p2align 4 -.Lshl_0_loop: - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae .Lshl_0_loop - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_bwd: - sub $0x80, %rdx -.Lcopy_backward_loop: - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lcopy_backward_loop - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1: - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_1 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1_bwd: - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_1_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2: - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_2 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2_bwd: - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_2_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3: - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_3 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3_bwd: - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_3_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4: - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_4 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4_bwd: - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_4_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5: - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_5 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5_bwd: - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_5_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6: - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_6 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6_bwd: - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_6_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7: - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_7 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7_bwd: - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_7_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8: - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_8 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8_bwd: - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_8_bwd -.Lshl_8_end_bwd: - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9: - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_9 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9_bwd: - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_9_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10: - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_10 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10_bwd: - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_10_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11: - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_11 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11_bwd: - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_11_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12: - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae .Lshl_12 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12_bwd: - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_12_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13: - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_13 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13_bwd: - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_13_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14: - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_14 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14_bwd: - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_14_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15: - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae .Lshl_15 - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15_bwd: - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae .Lshl_15_bwd - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lgobble_mem_fwd: - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - - mov __x86_shared_cache_size_half(%rip), %rcx - - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae .Lmemmove_is_memcpy_fwd - cmp %rcx, %r9 - jbe .Lll_cache_copy_fwd_start -.Lmemmove_is_memcpy_fwd: - - cmp %rcx, %rdx - ja .Lbigger_in_fwd - mov %rdx, %rcx -.Lbigger_in_fwd: - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe .Lll_cache_copy_fwd - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe .L2steps_copy_fwd - add %rcx, %rdx - xor %rcx, %rcx -.L2steps_copy_fwd: - sub $0x80, %rdx -.Lgobble_mem_fwd_loop: - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae .Lgobble_mem_fwd_loop - sfence - cmp $0x80, %rcx - jb .Lgobble_mem_fwd_end - add $0x80, %rdx -.Lll_cache_copy_fwd: - add %rcx, %rdx -.Lll_cache_copy_fwd_start: - sub $0x80, %rdx -.Lgobble_ll_loop_fwd: - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae .Lgobble_ll_loop_fwd -.Lgobble_mem_fwd_end: - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_144_bytes_fwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lgobble_mem_bwd: - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - mov __x86_shared_cache_size_half(%rip), %rcx - - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae .Lmemmove_is_memcpy_bwd - cmp %rcx, %r9 - jbe .Lll_cache_copy_bwd_start -.Lmemmove_is_memcpy_bwd: - - cmp %rcx, %rdx - ja .Lbigger - mov %rdx, %rcx -.Lbigger: - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe .Lll_cache_copy - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe .L2steps_copy - add %rcx, %rdx - xor %rcx, %rcx -.L2steps_copy: - sub $0x80, %rdx -.Lgobble_mem_bwd_loop: - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae .Lgobble_mem_bwd_loop - sfence - cmp $0x80, %rcx - jb .Lgobble_mem_bwd_end - add $0x80, %rdx -.Lll_cache_copy: - add %rcx, %rdx -.Lll_cache_copy_bwd_start: - sub $0x80, %rdx -.Lgobble_ll_loop: - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae .Lgobble_ll_loop -.Lgobble_mem_bwd_end: - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - lea .Ltable_144_bytes_bwd(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lfwd_write_128bytes: - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -.Lfwd_write_112bytes: - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -.Lfwd_write_96bytes: - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -.Lfwd_write_80bytes: - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -.Lfwd_write_64bytes: - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -.Lfwd_write_48bytes: - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -.Lfwd_write_32bytes: - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -.Lfwd_write_16bytes: - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -.Lfwd_write_0bytes: - ret - - .p2align 4 -.Lfwd_write_143bytes: - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -.Lfwd_write_127bytes: - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -.Lfwd_write_111bytes: - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -.Lfwd_write_95bytes: - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -.Lfwd_write_79bytes: - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -.Lfwd_write_63bytes: - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -.Lfwd_write_47bytes: - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -.Lfwd_write_31bytes: - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_15bytes: - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_142bytes: - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -.Lfwd_write_126bytes: - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) -.Lfwd_write_110bytes: - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -.Lfwd_write_94bytes: - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -.Lfwd_write_78bytes: - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -.Lfwd_write_62bytes: - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -.Lfwd_write_46bytes: - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -.Lfwd_write_30bytes: - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_14bytes: - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_141bytes: - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -.Lfwd_write_125bytes: - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -.Lfwd_write_109bytes: - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -.Lfwd_write_93bytes: - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -.Lfwd_write_77bytes: - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -.Lfwd_write_61bytes: - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -.Lfwd_write_45bytes: - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -.Lfwd_write_29bytes: - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_13bytes: - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_140bytes: - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -.Lfwd_write_124bytes: - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -.Lfwd_write_108bytes: - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -.Lfwd_write_92bytes: - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -.Lfwd_write_76bytes: - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -.Lfwd_write_60bytes: - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -.Lfwd_write_44bytes: - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -.Lfwd_write_28bytes: - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_12bytes: - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_139bytes: - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -.Lfwd_write_123bytes: - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -.Lfwd_write_107bytes: - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -.Lfwd_write_91bytes: - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -.Lfwd_write_75bytes: - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -.Lfwd_write_59bytes: - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -.Lfwd_write_43bytes: - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -.Lfwd_write_27bytes: - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_11bytes: - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_138bytes: - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -.Lfwd_write_122bytes: - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -.Lfwd_write_106bytes: - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -.Lfwd_write_90bytes: - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -.Lfwd_write_74bytes: - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -.Lfwd_write_58bytes: - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -.Lfwd_write_42bytes: - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -.Lfwd_write_26bytes: - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_10bytes: - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_137bytes: - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -.Lfwd_write_121bytes: - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) -.Lfwd_write_105bytes: - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -.Lfwd_write_89bytes: - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -.Lfwd_write_73bytes: - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -.Lfwd_write_57bytes: - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -.Lfwd_write_41bytes: - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -.Lfwd_write_25bytes: - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_9bytes: - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_136bytes: - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -.Lfwd_write_120bytes: - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -.Lfwd_write_104bytes: - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -.Lfwd_write_88bytes: - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -.Lfwd_write_72bytes: - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -.Lfwd_write_56bytes: - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -.Lfwd_write_40bytes: - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -.Lfwd_write_24bytes: - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_8bytes: - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lfwd_write_135bytes: - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -.Lfwd_write_119bytes: - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -.Lfwd_write_103bytes: - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -.Lfwd_write_87bytes: - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -.Lfwd_write_71bytes: - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -.Lfwd_write_55bytes: - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -.Lfwd_write_39bytes: - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -.Lfwd_write_23bytes: - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_7bytes: - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_134bytes: - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -.Lfwd_write_118bytes: - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -.Lfwd_write_102bytes: - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -.Lfwd_write_86bytes: - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -.Lfwd_write_70bytes: - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -.Lfwd_write_54bytes: - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -.Lfwd_write_38bytes: - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -.Lfwd_write_22bytes: - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_6bytes: - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_133bytes: - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) -.Lfwd_write_117bytes: - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -.Lfwd_write_101bytes: - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -.Lfwd_write_85bytes: - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -.Lfwd_write_69bytes: - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) -.Lfwd_write_53bytes: - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -.Lfwd_write_37bytes: - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -.Lfwd_write_21bytes: - lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_5bytes: - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_132bytes: - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -.Lfwd_write_116bytes: - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -.Lfwd_write_100bytes: - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -.Lfwd_write_84bytes: - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -.Lfwd_write_68bytes: - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -.Lfwd_write_52bytes: - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -.Lfwd_write_36bytes: - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -.Lfwd_write_20bytes: - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_4bytes: - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lfwd_write_131bytes: - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -.Lfwd_write_115bytes: - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -.Lfwd_write_99bytes: - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -.Lfwd_write_83bytes: - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -.Lfwd_write_67bytes: - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -.Lfwd_write_51bytes: - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -.Lfwd_write_35bytes: - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -.Lfwd_write_19bytes: - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_3bytes: - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -.Lfwd_write_130bytes: - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -.Lfwd_write_114bytes: - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -.Lfwd_write_98bytes: - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -.Lfwd_write_82bytes: - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -.Lfwd_write_66bytes: - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -.Lfwd_write_50bytes: - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -.Lfwd_write_34bytes: - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -.Lfwd_write_18bytes: - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_2bytes: - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -.Lfwd_write_129bytes: - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -.Lfwd_write_113bytes: - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -.Lfwd_write_97bytes: - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -.Lfwd_write_81bytes: - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -.Lfwd_write_65bytes: - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -.Lfwd_write_49bytes: - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -.Lfwd_write_33bytes: - lddqu -33(%rsi), %xmm0 - movdqu %xmm0, -33(%rdi) -.Lfwd_write_17bytes: - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -.Lfwd_write_1bytes: - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lbwd_write_128bytes: - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) -.Lbwd_write_112bytes: - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -.Lbwd_write_96bytes: - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) -.Lbwd_write_80bytes: - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -.Lbwd_write_64bytes: - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -.Lbwd_write_48bytes: - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -.Lbwd_write_32bytes: - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -.Lbwd_write_16bytes: - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -.Lbwd_write_0bytes: - ret - - .p2align 4 -.Lbwd_write_143bytes: - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -.Lbwd_write_127bytes: - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -.Lbwd_write_111bytes: - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -.Lbwd_write_95bytes: - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -.Lbwd_write_79bytes: - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -.Lbwd_write_63bytes: - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -.Lbwd_write_47bytes: - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -.Lbwd_write_31bytes: - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_15bytes: - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_142bytes: - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -.Lbwd_write_126bytes: - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -.Lbwd_write_110bytes: - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -.Lbwd_write_94bytes: - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -.Lbwd_write_78bytes: - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -.Lbwd_write_62bytes: - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -.Lbwd_write_46bytes: - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -.Lbwd_write_30bytes: - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_14bytes: - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_141bytes: - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -.Lbwd_write_125bytes: - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -.Lbwd_write_109bytes: - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -.Lbwd_write_93bytes: - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -.Lbwd_write_77bytes: - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -.Lbwd_write_61bytes: - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -.Lbwd_write_45bytes: - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -.Lbwd_write_29bytes: - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_13bytes: - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_140bytes: - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -.Lbwd_write_124bytes: - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -.Lbwd_write_108bytes: - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -.Lbwd_write_92bytes: - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -.Lbwd_write_76bytes: - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) -.Lbwd_write_60bytes: - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -.Lbwd_write_44bytes: - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -.Lbwd_write_28bytes: - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_12bytes: - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_139bytes: - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 123(%rdi) -.Lbwd_write_123bytes: - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -.Lbwd_write_107bytes: - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -.Lbwd_write_91bytes: - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -.Lbwd_write_75bytes: - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -.Lbwd_write_59bytes: - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -.Lbwd_write_43bytes: - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -.Lbwd_write_27bytes: - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_11bytes: - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_138bytes: - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -.Lbwd_write_122bytes: - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -.Lbwd_write_106bytes: - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -.Lbwd_write_90bytes: - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -.Lbwd_write_74bytes: - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -.Lbwd_write_58bytes: - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -.Lbwd_write_42bytes: - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -.Lbwd_write_26bytes: - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_10bytes: - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_137bytes: - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -.Lbwd_write_121bytes: - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -.Lbwd_write_105bytes: - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -.Lbwd_write_89bytes: - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -.Lbwd_write_73bytes: - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -.Lbwd_write_57bytes: - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -.Lbwd_write_41bytes: - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -.Lbwd_write_25bytes: - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_9bytes: - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_136bytes: - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -.Lbwd_write_120bytes: - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -.Lbwd_write_104bytes: - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -.Lbwd_write_88bytes: - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -.Lbwd_write_72bytes: - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -.Lbwd_write_56bytes: - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -.Lbwd_write_40bytes: - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -.Lbwd_write_24bytes: - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_8bytes: - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_135bytes: - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -.Lbwd_write_119bytes: - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -.Lbwd_write_103bytes: - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -.Lbwd_write_87bytes: - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -.Lbwd_write_71bytes: - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -.Lbwd_write_55bytes: - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) -.Lbwd_write_39bytes: - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -.Lbwd_write_23bytes: - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_7bytes: - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_134bytes: - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -.Lbwd_write_118bytes: - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -.Lbwd_write_102bytes: - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -.Lbwd_write_86bytes: - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -.Lbwd_write_70bytes: - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -.Lbwd_write_54bytes: - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -.Lbwd_write_38bytes: - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -.Lbwd_write_22bytes: - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_6bytes: - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_133bytes: - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -.Lbwd_write_117bytes: - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -.Lbwd_write_101bytes: - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -.Lbwd_write_85bytes: - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -.Lbwd_write_69bytes: - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -.Lbwd_write_53bytes: - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -.Lbwd_write_37bytes: - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -.Lbwd_write_21bytes: - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_5bytes: - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_132bytes: - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -.Lbwd_write_116bytes: - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -.Lbwd_write_100bytes: - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -.Lbwd_write_84bytes: - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -.Lbwd_write_68bytes: - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -.Lbwd_write_52bytes: - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -.Lbwd_write_36bytes: - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -.Lbwd_write_20bytes: - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_4bytes: - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_131bytes: - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -.Lbwd_write_115bytes: - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -.Lbwd_write_99bytes: - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -.Lbwd_write_83bytes: - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -.Lbwd_write_67bytes: - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -.Lbwd_write_51bytes: - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -.Lbwd_write_35bytes: - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -.Lbwd_write_19bytes: - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_3bytes: - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_130bytes: - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -.Lbwd_write_114bytes: - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -.Lbwd_write_98bytes: - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -.Lbwd_write_82bytes: - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -.Lbwd_write_66bytes: - lddqu 50(%rsi), %xmm0 - movdqu %xmm0, 50(%rdi) -.Lbwd_write_50bytes: - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -.Lbwd_write_34bytes: - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -.Lbwd_write_18bytes: - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_2bytes: - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -.Lbwd_write_129bytes: - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -.Lbwd_write_113bytes: - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -.Lbwd_write_97bytes: - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -.Lbwd_write_81bytes: - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -.Lbwd_write_65bytes: - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -.Lbwd_write_49bytes: - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -.Lbwd_write_33bytes: - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -.Lbwd_write_17bytes: - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -.Lbwd_write_1bytes: - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - - .section .rodata.ssse3 - .p2align 3 -.Ltable_144_bytes_bwd: - .int .Lbwd_write_0bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_1bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_2bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_3bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_4bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_5bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_6bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_7bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_8bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_9bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_10bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_11bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_12bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_13bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_14bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_15bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_16bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_17bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_18bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_19bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_20bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_21bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_22bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_23bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_24bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_25bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_26bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_27bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_28bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_29bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_30bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_31bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_32bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_33bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_34bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_35bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_36bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_37bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_38bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_39bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_40bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_41bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_42bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_43bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_44bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_45bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_46bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_47bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_48bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_49bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_50bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_51bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_52bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_53bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_54bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_55bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_56bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_57bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_58bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_59bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_60bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_61bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_62bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_63bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_64bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_65bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_66bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_67bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_68bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_69bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_70bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_71bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_72bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_73bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_74bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_75bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_76bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_77bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_78bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_79bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_80bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_81bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_82bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_83bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_84bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_85bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_86bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_87bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_88bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_89bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_90bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_91bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_92bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_93bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_94bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_95bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_96bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_97bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_98bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_99bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_100bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_101bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_102bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_103bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_104bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_105bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_106bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_107bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_108bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_109bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_110bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_111bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_112bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_113bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_114bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_115bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_116bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_117bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_118bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_119bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_120bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_121bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_122bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_123bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_124bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_125bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_126bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_127bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_128bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_129bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_130bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_131bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_132bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_133bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_134bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_135bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_136bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_137bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_138bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_139bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_140bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_141bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_142bytes - .Ltable_144_bytes_bwd - .int .Lbwd_write_143bytes - .Ltable_144_bytes_bwd - - .p2align 3 -.Ltable_144_bytes_fwd: - .int .Lfwd_write_0bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_1bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_2bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_3bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_4bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_5bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_6bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_7bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_8bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_9bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_10bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_11bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_12bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_13bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_14bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_15bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_16bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_17bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_18bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_19bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_20bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_21bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_22bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_23bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_24bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_25bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_26bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_27bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_28bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_29bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_30bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_31bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_32bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_33bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_34bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_35bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_36bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_37bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_38bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_39bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_40bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_41bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_42bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_43bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_44bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_45bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_46bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_47bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_48bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_49bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_50bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_51bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_52bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_53bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_54bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_55bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_56bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_57bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_58bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_59bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_60bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_61bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_62bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_63bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_64bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_65bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_66bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_67bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_68bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_69bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_70bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_71bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_72bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_73bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_74bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_75bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_76bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_77bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_78bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_79bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_80bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_81bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_82bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_83bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_84bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_85bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_86bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_87bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_88bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_89bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_90bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_91bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_92bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_93bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_94bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_95bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_96bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_97bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_98bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_99bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_100bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_101bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_102bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_103bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_104bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_105bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_106bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_107bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_108bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_109bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_110bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_111bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_112bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_113bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_114bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_115bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_116bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_117bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_118bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_119bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_120bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_121bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_122bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_123bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_124bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_125bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_126bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_127bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_128bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_129bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_130bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_131bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_132bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_133bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_134bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_135bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_136bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_137bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_138bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_139bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_140bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_141bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_142bytes - .Ltable_144_bytes_fwd - .int .Lfwd_write_143bytes - .Ltable_144_bytes_fwd - - .p2align 3 -.Lshl_table_fwd: - .int .Lshl_0 - .Lshl_table_fwd - .int .Lshl_1 - .Lshl_table_fwd - .int .Lshl_2 - .Lshl_table_fwd - .int .Lshl_3 - .Lshl_table_fwd - .int .Lshl_4 - .Lshl_table_fwd - .int .Lshl_5 - .Lshl_table_fwd - .int .Lshl_6 - .Lshl_table_fwd - .int .Lshl_7 - .Lshl_table_fwd - .int .Lshl_8 - .Lshl_table_fwd - .int .Lshl_9 - .Lshl_table_fwd - .int .Lshl_10 - .Lshl_table_fwd - .int .Lshl_11 - .Lshl_table_fwd - .int .Lshl_12 - .Lshl_table_fwd - .int .Lshl_13 - .Lshl_table_fwd - .int .Lshl_14 - .Lshl_table_fwd - .int .Lshl_15 - .Lshl_table_fwd - - .p2align 3 -.Lshl_table_bwd: - .int .Lshl_0_bwd - .Lshl_table_bwd - .int .Lshl_1_bwd - .Lshl_table_bwd - .int .Lshl_2_bwd - .Lshl_table_bwd - .int .Lshl_3_bwd - .Lshl_table_bwd - .int .Lshl_4_bwd - .Lshl_table_bwd - .int .Lshl_5_bwd - .Lshl_table_bwd - .int .Lshl_6_bwd - .Lshl_table_bwd - .int .Lshl_7_bwd - .Lshl_table_bwd - .int .Lshl_8_bwd - .Lshl_table_bwd - .int .Lshl_9_bwd - .Lshl_table_bwd - .int .Lshl_10_bwd - .Lshl_table_bwd - .int .Lshl_11_bwd - .Lshl_table_bwd - .int .Lshl_12_bwd - .Lshl_table_bwd - .int .Lshl_13_bwd - .Lshl_table_bwd - .int .Lshl_14_bwd - .Lshl_table_bwd - .int .Lshl_15_bwd - .Lshl_table_bwd diff --git a/impls/memmove-ssse3.s b/impls/memmove-ssse3.s index 86e1a84..98fa4fc 100644 --- a/impls/memmove-ssse3.s +++ b/impls/memmove-ssse3.s @@ -1,3052 +1,259 @@ .section .text.ssse3 + +.globl __mempcpy_ssse3 +__mempcpy_ssse3: + mov %rdi, %rax + add %rdx, %rax + jmp .Lstart + .globl __memmove_ssse3 __memmove_ssse3: - mov %rdi, %rax - cmp %rsi, %rdi - jb .Lcopy_forward - je .Lwrite_0bytes - cmp $79, %rdx - jbe .Lcopy_forward - jmp .Lcopy_backward -.Lcopy_forward: + movq %rdi, %rax .Lstart: - cmp $79, %rdx - lea .Ltable_less_80bytes(%rip), %r11 - ja .L80bytesormore - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - jmp *%r9 - ud2 - - .p2align 4 -.L80bytesormore: - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - - mov __x86_shared_cache_size_half(%rip), %rcx - - cmp %rcx, %rdx - mov %rsi, %r9 - ja .Llarge_page_fwd - and $0xf, %r9 - jz .Lshl_0 - - mov __x86_data_cache_size_half(%rip), %rcx - - lea .Lshl_table(%rip), %r11; movslq (%r11, %r9, 4), %r9; lea (%r11, %r9), %r9; jmp *%r9; ud2 - - .p2align 4 -.Lcopy_backward: - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - - mov __x86_shared_cache_size_half(%rip), %rcx - - cmp %rcx, %rdx - mov %rsi, %r9 - ja .Llarge_page_bwd - and $0xf, %r9 - jz .Lshl_0_bwd - - mov __x86_data_cache_size_half(%rip), %rcx - - lea .Lshl_table_bwd(%rip), %r11; movslq (%r11, %r9, 4), %r9; lea (%r11, %r9), %r9; jmp *%r9; ud2 - - .p2align 4 -.Lshl_0: - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja .Lshl_0_gobble - cmp $64, %rdx - jb .Lshl_0_less_64bytes - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -.Lshl_0_less_64bytes: - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble: - - cmp __x86_data_cache_size_half(%rip), %rdx - - lea -128(%rdx), %rdx - jae .Lshl_0_gobble_mem_loop -.Lshl_0_gobble_cache_loop: - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae .Lshl_0_gobble_cache_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_cache_less_64bytes - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -.Lshl_0_cache_less_64bytes: - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble_mem_loop: - prefetcht0 0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae .Lshl_0_gobble_mem_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_mem_less_64bytes - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -.Lshl_0_mem_less_64bytes: - cmp $0x20, %rdx - jb .Lshl_0_mem_less_32bytes - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -.Lshl_0_mem_less_32bytes: - add %rdx, %rdi - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_bwd: - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja .Lshl_0_gobble_bwd - cmp $64, %rdx - jb .Lshl_0_less_64bytes_bwd - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -.Lshl_0_less_64bytes_bwd: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble_bwd: - - cmp __x86_data_cache_size_half(%rip), %rdx - - lea -128(%rdx), %rdx - jae .Lshl_0_gobble_mem_bwd_loop -.Lshl_0_gobble_bwd_loop: - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae .Lshl_0_gobble_bwd_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_gobble_bwd_less_64bytes - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -.Lshl_0_gobble_bwd_less_64bytes: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_0_gobble_mem_bwd_loop: - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa %xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae .Lshl_0_gobble_mem_bwd_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Lshl_0_mem_bwd_less_64bytes - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -.Lshl_0_mem_bwd_less_64bytes: - cmp $0x20, %rdx - jb .Lshl_0_mem_bwd_less_32bytes - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -.Lshl_0_mem_bwd_less_32bytes: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1: - lea (.Lshl_1_loop_L1-.Lshl_1)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb .LL1_fwd - lea (.Lshl_1_loop_L2-.Lshl_1_loop_L1)(%r9), %r9 -.LL1_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_1_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_1_loop_L1: - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_1_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_1_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_1_bwd: - lea (.Lshl_1_bwd_loop_L1-.Lshl_1_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb .LL1_bwd - lea (.Lshl_1_bwd_loop_L2-.Lshl_1_bwd_loop_L1)(%r9), %r9 -.LL1_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_1_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_1_bwd_loop_L1: - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 - movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_1_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_1_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2: - lea (.Lshl_2_loop_L1-.Lshl_2)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb .LL2_fwd - lea (.Lshl_2_loop_L2-.Lshl_2_loop_L1)(%r9), %r9 -.LL2_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_2_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_2_loop_L1: - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_2_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_2_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_2_bwd: - lea (.Lshl_2_bwd_loop_L1-.Lshl_2_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb .LL2_bwd - lea (.Lshl_2_bwd_loop_L2-.Lshl_2_bwd_loop_L1)(%r9), %r9 -.LL2_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_2_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_2_bwd_loop_L1: - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_2_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_2_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3: - lea (.Lshl_3_loop_L1-.Lshl_3)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb .LL3_fwd - lea (.Lshl_3_loop_L2-.Lshl_3_loop_L1)(%r9), %r9 -.LL3_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_3_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_3_loop_L1: - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_3_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_3_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_3_bwd: - lea (.Lshl_3_bwd_loop_L1-.Lshl_3_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb .LL3_bwd - lea (.Lshl_3_bwd_loop_L2-.Lshl_3_bwd_loop_L1)(%r9), %r9 -.LL3_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_3_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_3_bwd_loop_L1: - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_3_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_3_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4: - lea (.Lshl_4_loop_L1-.Lshl_4)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb .LL4_fwd - lea (.Lshl_4_loop_L2-.Lshl_4_loop_L1)(%r9), %r9 -.LL4_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_4_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_4_loop_L1: - sub $64, %rdx - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_4_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_4_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_4_bwd: - lea (.Lshl_4_bwd_loop_L1-.Lshl_4_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb .LL4_bwd - lea (.Lshl_4_bwd_loop_L2-.Lshl_4_bwd_loop_L1)(%r9), %r9 -.LL4_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_4_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_4_bwd_loop_L1: - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_4_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_4_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5: - lea (.Lshl_5_loop_L1-.Lshl_5)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb .LL5_fwd - lea (.Lshl_5_loop_L2-.Lshl_5_loop_L1)(%r9), %r9 -.LL5_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_5_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_5_loop_L1: - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_5_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_5_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_5_bwd: - lea (.Lshl_5_bwd_loop_L1-.Lshl_5_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb .LL5_bwd - lea (.Lshl_5_bwd_loop_L2-.Lshl_5_bwd_loop_L1)(%r9), %r9 -.LL5_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_5_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_5_bwd_loop_L1: - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_5_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_5_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6: - lea (.Lshl_6_loop_L1-.Lshl_6)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb .LL6_fwd - lea (.Lshl_6_loop_L2-.Lshl_6_loop_L1)(%r9), %r9 -.LL6_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_6_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_6_loop_L1: - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_6_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_6_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_6_bwd: - lea (.Lshl_6_bwd_loop_L1-.Lshl_6_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb .LL6_bwd - lea (.Lshl_6_bwd_loop_L2-.Lshl_6_bwd_loop_L1)(%r9), %r9 -.LL6_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_6_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_6_bwd_loop_L1: - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_6_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_6_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7: - lea (.Lshl_7_loop_L1-.Lshl_7)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb .LL7_fwd - lea (.Lshl_7_loop_L2-.Lshl_7_loop_L1)(%r9), %r9 -.LL7_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_7_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_7_loop_L1: - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_7_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_7_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_7_bwd: - lea (.Lshl_7_bwd_loop_L1-.Lshl_7_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb .LL7_bwd - lea (.Lshl_7_bwd_loop_L2-.Lshl_7_bwd_loop_L1)(%r9), %r9 -.LL7_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_7_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_7_bwd_loop_L1: - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_7_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_7_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8: - lea (.Lshl_8_loop_L1-.Lshl_8)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb .LL8_fwd - lea (.Lshl_8_loop_L2-.Lshl_8_loop_L1)(%r9), %r9 -.LL8_fwd: - lea -64(%rdx), %rdx - jmp *%r9 -.Lshl_8_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_8_loop_L1: - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_8_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 - .p2align 4 -.Lshl_8_end: - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_8_bwd: - lea (.Lshl_8_bwd_loop_L1-.Lshl_8_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb .LL8_bwd - lea (.Lshl_8_bwd_loop_L2-.Lshl_8_bwd_loop_L1)(%r9), %r9 -.LL8_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_8_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_8_bwd_loop_L1: - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_8_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_8_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9: - lea (.Lshl_9_loop_L1-.Lshl_9)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb .LL9_fwd - lea (.Lshl_9_loop_L2-.Lshl_9_loop_L1)(%r9), %r9 -.LL9_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_9_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_9_loop_L1: - sub $64, %rdx - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_9_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_9_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_9_bwd: - lea (.Lshl_9_bwd_loop_L1-.Lshl_9_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb .LL9_bwd - lea (.Lshl_9_bwd_loop_L2-.Lshl_9_bwd_loop_L1)(%r9), %r9 -.LL9_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_9_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_9_bwd_loop_L1: - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_9_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_9_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10: - lea (.Lshl_10_loop_L1-.Lshl_10)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb .LL10_fwd - lea (.Lshl_10_loop_L2-.Lshl_10_loop_L1)(%r9), %r9 -.LL10_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_10_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_10_loop_L1: - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_10_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_10_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_10_bwd: - lea (.Lshl_10_bwd_loop_L1-.Lshl_10_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb .LL10_bwd - lea (.Lshl_10_bwd_loop_L2-.Lshl_10_bwd_loop_L1)(%r9), %r9 -.LL10_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_10_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_10_bwd_loop_L1: - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_10_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_10_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11: - lea (.Lshl_11_loop_L1-.Lshl_11)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb .LL11_fwd - lea (.Lshl_11_loop_L2-.Lshl_11_loop_L1)(%r9), %r9 -.LL11_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_11_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_11_loop_L1: - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_11_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_11_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_11_bwd: - lea (.Lshl_11_bwd_loop_L1-.Lshl_11_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb .LL11_bwd - lea (.Lshl_11_bwd_loop_L2-.Lshl_11_bwd_loop_L1)(%r9), %r9 -.LL11_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_11_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_11_bwd_loop_L1: - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_11_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_11_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12: - lea (.Lshl_12_loop_L1-.Lshl_12)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb .LL12_fwd - lea (.Lshl_12_loop_L2-.Lshl_12_loop_L1)(%r9), %r9 -.LL12_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_12_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_12_loop_L1: - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_12_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_12_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_12_bwd: - lea (.Lshl_12_bwd_loop_L1-.Lshl_12_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb .LL12_bwd - lea (.Lshl_12_bwd_loop_L2-.Lshl_12_bwd_loop_L1)(%r9), %r9 -.LL12_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_12_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_12_bwd_loop_L1: - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_12_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_12_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13: - lea (.Lshl_13_loop_L1-.Lshl_13)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb .LL13_fwd - lea (.Lshl_13_loop_L2-.Lshl_13_loop_L1)(%r9), %r9 -.LL13_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_13_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_13_loop_L1: - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_13_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_13_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_13_bwd: - lea (.Lshl_13_bwd_loop_L1-.Lshl_13_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb .LL13_bwd - lea (.Lshl_13_bwd_loop_L2-.Lshl_13_bwd_loop_L1)(%r9), %r9 -.LL13_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_13_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_13_bwd_loop_L1: - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_13_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_13_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14: - lea (.Lshl_14_loop_L1-.Lshl_14)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb .LL14_fwd - lea (.Lshl_14_loop_L2-.Lshl_14_loop_L1)(%r9), %r9 -.LL14_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_14_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_14_loop_L1: - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_14_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_14_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_14_bwd: - lea (.Lshl_14_bwd_loop_L1-.Lshl_14_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb .LL14_bwd - lea (.Lshl_14_bwd_loop_L2-.Lshl_14_bwd_loop_L1)(%r9), %r9 -.LL14_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_14_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_14_bwd_loop_L1: - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_14_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_14_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15: - lea (.Lshl_15_loop_L1-.Lshl_15)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb .LL15_fwd - lea (.Lshl_15_loop_L2-.Lshl_15_loop_L1)(%r9), %r9 -.LL15_fwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_15_loop_L2: - prefetchnta 0x1c0(%rsi) -.Lshl_15_loop_L1: - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb .Lshl_15_end - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - jmp *%r9 - ud2 -.Lshl_15_end: - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lshl_15_bwd: - lea (.Lshl_15_bwd_loop_L1-.Lshl_15_bwd)(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb .LL15_bwd - lea (.Lshl_15_bwd_loop_L2-.Lshl_15_bwd_loop_L1)(%r9), %r9 -.LL15_bwd: - lea -64(%rdx), %rdx - jmp *%r9 - ud2 -.Lshl_15_bwd_loop_L2: - prefetchnta -0x1c0(%rsi) -.Lshl_15_bwd_loop_L1: - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb .Lshl_15_bwd_end - movaps %xmm4, (%rdi) - jmp *%r9 - ud2 -.Lshl_15_bwd_end: - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lwrite_72bytes: - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_64bytes: - movdqu -64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_56bytes: - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_48bytes: - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_40bytes: - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_32bytes: - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_24bytes: - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_16bytes: - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_8bytes: - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -.Lwrite_0bytes: - ret - - .p2align 4 -.Lwrite_73bytes: - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_65bytes: - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_57bytes: - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_49bytes: - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_41bytes: - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_33bytes: - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_25bytes: - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_17bytes: - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_9bytes: - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_1bytes: - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -.Lwrite_74bytes: - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_66bytes: - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_58bytes: - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_50bytes: - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_42bytes: - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_34bytes: - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_26bytes: - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_18bytes: - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_10bytes: - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_2bytes: - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -.Lwrite_75bytes: - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_67bytes: - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_59bytes: - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_51bytes: - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_43bytes: - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_35bytes: - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_27bytes: - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_19bytes: - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_11bytes: - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_3bytes: - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -.Lwrite_76bytes: - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_68bytes: - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_60bytes: - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_52bytes: - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_44bytes: - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_36bytes: - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_28bytes: - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_20bytes: - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -.Lwrite_12bytes: - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret + cmpq $16, %rdx + jb .Lcopy_0_15 - .p2align 4 -.Lwrite_4bytes: - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret + movups 0(%rsi), %xmm0 + movups -16(%rsi, %rdx), %xmm7 + cmpq $32, %rdx + ja .Lmore_2x_vec - .p2align 4 -.Lwrite_77bytes: - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) + movups %xmm0, 0(%rdi) + movups %xmm7, -16(%rdi, %rdx) ret - .p2align 4 -.Lwrite_69bytes: - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) + .p2align 4,, 4 +.Lcopy_0_15: + cmpl $4, %edx + jb .Lcopy_0_3 + cmpl $8, %edx + jb .Lcopy_4_7 + movq 0(%rsi), %rcx + movq -8(%rsi, %rdx), %rsi + movq %rcx, 0(%rdi) + movq %rsi, -8(%rdi, %rdx) ret - .p2align 4 -.Lwrite_61bytes: - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) + .p2align 4,, 4 +.Lcopy_4_7: + movl 0(%rsi), %ecx + movl -4(%rsi, %rdx), %esi + movl %ecx, 0(%rdi) + movl %esi, -4(%rdi, %rdx) ret - .p2align 4 -.Lwrite_53bytes: - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret + .p2align 4,, 4 +.Lcopy_0_3: + decl %edx + jl .Lcopy_0_0 + movb (%rsi), %cl + je .Lcopy_1_1 - .p2align 4 -.Lwrite_45bytes: - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) + movzwl -1(%rsi, %rdx), %esi + movw %si, -1(%rdi, %rdx) +.Lcopy_1_1: + movb %cl, (%rdi) +.Lcopy_0_0: ret - .p2align 4 -.Lwrite_37bytes: - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret + .p2align 4,, 4 +.Lcopy_4x_vec: + movups 16(%rsi), %xmm1 + movups -32(%rsi, %rdx), %xmm2 - .p2align 4 -.Lwrite_29bytes: - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) + movups %xmm0, 0(%rdi) + movups %xmm1, 16(%rdi) + movups %xmm2, -32(%rdi, %rdx) + movups %xmm7, -16(%rdi, %rdx) +.Lnop: ret .p2align 4 -.Lwrite_21bytes: - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret +.Lmore_2x_vec: + cmpq $64, %rdx + jbe .Lcopy_4x_vec - .p2align 4 -.Lwrite_13bytes: - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret + movq %rdi, %rcx - .p2align 4 -.Lwrite_5bytes: - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret + subq %rsi, %rcx + cmpq %rdx, %rcx + jb .Lcopy_backward - .p2align 4 -.Lwrite_78bytes: - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + movups -32(%rsi, %rdx), %xmm8 + movups -48(%rsi, %rdx), %xmm9 - .p2align 4 -.Lwrite_70bytes: - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + andl $0xf, %ecx - .p2align 4 -.Lwrite_62bytes: - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + movq %rsi, %r9 + addq %rcx, %rsi + andq $-16, %rsi - .p2align 4 -.Lwrite_54bytes: - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + movaps (%rsi), %xmm1 - .p2align 4 -.Lwrite_46bytes: - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + movups %xmm0, (%rdi) - .p2align 4 -.Lwrite_38bytes: - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_30bytes: - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_22bytes: - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_14bytes: - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_6bytes: - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret + cmp __x86_shared_cache_size_half(%rip), %rdx - .p2align 4 -.Lwrite_79bytes: - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + ja .Llarge_memcpy - .p2align 4 -.Lwrite_71bytes: - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + leaq -64(%rdi, %rdx), %r8 + andq $-16, %rdi + movl $48, %edx - .p2align 4 -.Lwrite_63bytes: - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_55bytes: - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_47bytes: - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + leaq .Lloop_fwd_start(%rip), %r9 + sall $6, %ecx + addq %r9, %rcx + jmp * %rcx - .p2align 4 -.Lwrite_39bytes: - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_31bytes: - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_23bytes: - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_15bytes: - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -.Lwrite_7bytes: - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -.Llarge_page_fwd: - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx - - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae .Lmemmove_is_memcpy_fwd - shl $2, %rcx - cmp %rcx, %rdx - jb .Lll_cache_copy_fwd_start -.Lmemmove_is_memcpy_fwd: - -.Llarge_page_loop: - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae .Llarge_page_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Llarge_page_less_64bytes - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -.Llarge_page_less_64bytes: - add %rdx, %rsi - add %rdx, %rdi - sfence - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lll_cache_copy_fwd_start: - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae .Lll_cache_copy_fwd_start - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Llarge_page_ll_less_fwd_64bytes - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -.Llarge_page_ll_less_fwd_64bytes: - add %rdx, %rsi - add %rdx, %rdi - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Llarge_page_bwd: - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx - - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae .Lmemmove_is_memcpy_bwd - cmp %rcx, %r9 - jb .Lll_cache_copy_bwd_start -.Lmemmove_is_memcpy_bwd: - -.Llarge_page_bwd_loop: - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae .Llarge_page_bwd_loop - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Llarge_page_less_bwd_64bytes - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -.Llarge_page_less_bwd_64bytes: + .p2align 4,, 8 +.Lcopy_backward: + testq %rcx, %rcx + jz .Lnop + + movups 16(%rsi), %xmm4 + movups 32(%rsi), %xmm5 + + movq %rdi, %r8 + subq %rdi, %rsi + leaq -49(%rdi, %rdx), %rdi + andq $-16, %rdi + addq %rdi, %rsi + andq $-16, %rsi + + movaps 48(%rsi), %xmm6 + + leaq .Lloop_bkwd_start(%rip), %r9 + andl $0xf, %ecx + sall $6, %ecx + addq %r9, %rcx + jmp * %rcx + + .p2align 4,, 8 +.Llarge_memcpy: + movups -64(%r9, %rdx), %xmm10 + movups -80(%r9, %rdx), %xmm11 + + sall $5, %ecx + leal (%rcx, %rcx, 2), %r8d + leaq -96(%rdi, %rdx), %rcx + andq $-16, %rdi + leaq .Llarge_loop_fwd_start(%rip), %rdx + addq %r8, %rdx + jmp * %rdx + + .p2align 6 +.Lloop_fwd_start: +.Lloop_fwd_0x0: + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + addq %rdx, %rdi + addq %rdx, %rsi + cmpq %rdi, %r8 + ja .Lloop_fwd_0x0 +.Lend_loop_fwd: + movups %xmm9, 16(%r8) + movups %xmm8, 32(%r8) + movups %xmm7, 48(%r8) + ret + ; .p2align 6; .Lloop_fwd_0xf: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0xf, %xmm2, %xmm3; palignr $0xf, %xmm0, %xmm2; palignr $0xf, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0xf; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0xe: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0xe, %xmm2, %xmm3; palignr $0xe, %xmm0, %xmm2; palignr $0xe, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0xe; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0xd: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0xd, %xmm2, %xmm3; palignr $0xd, %xmm0, %xmm2; palignr $0xd, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0xd; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0xc: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0xc, %xmm2, %xmm3; palignr $0xc, %xmm0, %xmm2; palignr $0xc, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0xc; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0xb: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0xb, %xmm2, %xmm3; palignr $0xb, %xmm0, %xmm2; palignr $0xb, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0xb; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0xa: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0xa, %xmm2, %xmm3; palignr $0xa, %xmm0, %xmm2; palignr $0xa, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0xa; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x9: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x9, %xmm2, %xmm3; palignr $0x9, %xmm0, %xmm2; palignr $0x9, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x9; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x8: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x8, %xmm2, %xmm3; palignr $0x8, %xmm0, %xmm2; palignr $0x8, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x8; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x7: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x7, %xmm2, %xmm3; palignr $0x7, %xmm0, %xmm2; palignr $0x7, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x7; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x6: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x6, %xmm2, %xmm3; palignr $0x6, %xmm0, %xmm2; palignr $0x6, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x6; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x5: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x5, %xmm2, %xmm3; palignr $0x5, %xmm0, %xmm2; palignr $0x5, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x5; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x4: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x4, %xmm2, %xmm3; palignr $0x4, %xmm0, %xmm2; palignr $0x4, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x4; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x3: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x3, %xmm2, %xmm3; palignr $0x3, %xmm0, %xmm2; palignr $0x3, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x3; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x2: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x2, %xmm2, %xmm3; palignr $0x2, %xmm0, %xmm2; palignr $0x2, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x2; jmp .Lend_loop_fwd; + ; .p2align 6; .Lloop_fwd_0x1: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps %xmm3, %xmm4; palignr $0x1, %xmm2, %xmm3; palignr $0x1, %xmm0, %xmm2; palignr $0x1, %xmm1, %xmm0; movaps %xmm4, %xmm1; movaps %xmm0, 16(%rdi); movaps %xmm2, 32(%rdi); movaps %xmm3, 48(%rdi); addq %rdx, %rdi; addq %rdx, %rsi; cmpq %rdi, %r8; ja .Lloop_fwd_0x1; jmp .Lend_loop_fwd; + + .p2align 6 +.Llarge_loop_fwd_start: +.Llarge_loop_fwd_0x0: + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps 64(%rsi), %xmm4 + movaps 80(%rsi), %xmm5 + movntps %xmm1, 16(%rdi) + movntps %xmm2, 32(%rdi) + movntps %xmm3, 48(%rdi) + movntps %xmm4, 64(%rdi) + movntps %xmm5, 80(%rdi) + addq $80, %rdi + addq $80, %rsi + cmpq %rdi, %rcx + ja .Llarge_loop_fwd_0x0 + + .p2align 4 +.Lend_large_loop_fwd: sfence - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .p2align 4 -.Lll_cache_copy_bwd_start: - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae .Lll_cache_copy_bwd_start - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl .Llarge_page_ll_less_bwd_64bytes - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -.Llarge_page_ll_less_bwd_64bytes: - lea .Ltable_less_80bytes(%rip), %r11; movslq (%r11, %rdx, 4), %rdx; lea (%r11, %rdx), %rdx; jmp *%rdx; ud2 - - .section .rodata.ssse3 - .p2align 3 -.Ltable_less_80bytes: - .int .Lwrite_0bytes - .Ltable_less_80bytes - .int .Lwrite_1bytes - .Ltable_less_80bytes - .int .Lwrite_2bytes - .Ltable_less_80bytes - .int .Lwrite_3bytes - .Ltable_less_80bytes - .int .Lwrite_4bytes - .Ltable_less_80bytes - .int .Lwrite_5bytes - .Ltable_less_80bytes - .int .Lwrite_6bytes - .Ltable_less_80bytes - .int .Lwrite_7bytes - .Ltable_less_80bytes - .int .Lwrite_8bytes - .Ltable_less_80bytes - .int .Lwrite_9bytes - .Ltable_less_80bytes - .int .Lwrite_10bytes - .Ltable_less_80bytes - .int .Lwrite_11bytes - .Ltable_less_80bytes - .int .Lwrite_12bytes - .Ltable_less_80bytes - .int .Lwrite_13bytes - .Ltable_less_80bytes - .int .Lwrite_14bytes - .Ltable_less_80bytes - .int .Lwrite_15bytes - .Ltable_less_80bytes - .int .Lwrite_16bytes - .Ltable_less_80bytes - .int .Lwrite_17bytes - .Ltable_less_80bytes - .int .Lwrite_18bytes - .Ltable_less_80bytes - .int .Lwrite_19bytes - .Ltable_less_80bytes - .int .Lwrite_20bytes - .Ltable_less_80bytes - .int .Lwrite_21bytes - .Ltable_less_80bytes - .int .Lwrite_22bytes - .Ltable_less_80bytes - .int .Lwrite_23bytes - .Ltable_less_80bytes - .int .Lwrite_24bytes - .Ltable_less_80bytes - .int .Lwrite_25bytes - .Ltable_less_80bytes - .int .Lwrite_26bytes - .Ltable_less_80bytes - .int .Lwrite_27bytes - .Ltable_less_80bytes - .int .Lwrite_28bytes - .Ltable_less_80bytes - .int .Lwrite_29bytes - .Ltable_less_80bytes - .int .Lwrite_30bytes - .Ltable_less_80bytes - .int .Lwrite_31bytes - .Ltable_less_80bytes - .int .Lwrite_32bytes - .Ltable_less_80bytes - .int .Lwrite_33bytes - .Ltable_less_80bytes - .int .Lwrite_34bytes - .Ltable_less_80bytes - .int .Lwrite_35bytes - .Ltable_less_80bytes - .int .Lwrite_36bytes - .Ltable_less_80bytes - .int .Lwrite_37bytes - .Ltable_less_80bytes - .int .Lwrite_38bytes - .Ltable_less_80bytes - .int .Lwrite_39bytes - .Ltable_less_80bytes - .int .Lwrite_40bytes - .Ltable_less_80bytes - .int .Lwrite_41bytes - .Ltable_less_80bytes - .int .Lwrite_42bytes - .Ltable_less_80bytes - .int .Lwrite_43bytes - .Ltable_less_80bytes - .int .Lwrite_44bytes - .Ltable_less_80bytes - .int .Lwrite_45bytes - .Ltable_less_80bytes - .int .Lwrite_46bytes - .Ltable_less_80bytes - .int .Lwrite_47bytes - .Ltable_less_80bytes - .int .Lwrite_48bytes - .Ltable_less_80bytes - .int .Lwrite_49bytes - .Ltable_less_80bytes - .int .Lwrite_50bytes - .Ltable_less_80bytes - .int .Lwrite_51bytes - .Ltable_less_80bytes - .int .Lwrite_52bytes - .Ltable_less_80bytes - .int .Lwrite_53bytes - .Ltable_less_80bytes - .int .Lwrite_54bytes - .Ltable_less_80bytes - .int .Lwrite_55bytes - .Ltable_less_80bytes - .int .Lwrite_56bytes - .Ltable_less_80bytes - .int .Lwrite_57bytes - .Ltable_less_80bytes - .int .Lwrite_58bytes - .Ltable_less_80bytes - .int .Lwrite_59bytes - .Ltable_less_80bytes - .int .Lwrite_60bytes - .Ltable_less_80bytes - .int .Lwrite_61bytes - .Ltable_less_80bytes - .int .Lwrite_62bytes - .Ltable_less_80bytes - .int .Lwrite_63bytes - .Ltable_less_80bytes - .int .Lwrite_64bytes - .Ltable_less_80bytes - .int .Lwrite_65bytes - .Ltable_less_80bytes - .int .Lwrite_66bytes - .Ltable_less_80bytes - .int .Lwrite_67bytes - .Ltable_less_80bytes - .int .Lwrite_68bytes - .Ltable_less_80bytes - .int .Lwrite_69bytes - .Ltable_less_80bytes - .int .Lwrite_70bytes - .Ltable_less_80bytes - .int .Lwrite_71bytes - .Ltable_less_80bytes - .int .Lwrite_72bytes - .Ltable_less_80bytes - .int .Lwrite_73bytes - .Ltable_less_80bytes - .int .Lwrite_74bytes - .Ltable_less_80bytes - .int .Lwrite_75bytes - .Ltable_less_80bytes - .int .Lwrite_76bytes - .Ltable_less_80bytes - .int .Lwrite_77bytes - .Ltable_less_80bytes - .int .Lwrite_78bytes - .Ltable_less_80bytes - .int .Lwrite_79bytes - .Ltable_less_80bytes - - .p2align 3 -.Lshl_table: - .int .Lshl_0 - .Lshl_table - .int .Lshl_1 - .Lshl_table - .int .Lshl_2 - .Lshl_table - .int .Lshl_3 - .Lshl_table - .int .Lshl_4 - .Lshl_table - .int .Lshl_5 - .Lshl_table - .int .Lshl_6 - .Lshl_table - .int .Lshl_7 - .Lshl_table - .int .Lshl_8 - .Lshl_table - .int .Lshl_9 - .Lshl_table - .int .Lshl_10 - .Lshl_table - .int .Lshl_11 - .Lshl_table - .int .Lshl_12 - .Lshl_table - .int .Lshl_13 - .Lshl_table - .int .Lshl_14 - .Lshl_table - .int .Lshl_15 - .Lshl_table - - .p2align 3 -.Lshl_table_bwd: - .int .Lshl_0_bwd - .Lshl_table_bwd - .int .Lshl_1_bwd - .Lshl_table_bwd - .int .Lshl_2_bwd - .Lshl_table_bwd - .int .Lshl_3_bwd - .Lshl_table_bwd - .int .Lshl_4_bwd - .Lshl_table_bwd - .int .Lshl_5_bwd - .Lshl_table_bwd - .int .Lshl_6_bwd - .Lshl_table_bwd - .int .Lshl_7_bwd - .Lshl_table_bwd - .int .Lshl_8_bwd - .Lshl_table_bwd - .int .Lshl_9_bwd - .Lshl_table_bwd - .int .Lshl_10_bwd - .Lshl_table_bwd - .int .Lshl_11_bwd - .Lshl_table_bwd - .int .Lshl_12_bwd - .Lshl_table_bwd - .int .Lshl_13_bwd - .Lshl_table_bwd - .int .Lshl_14_bwd - .Lshl_table_bwd - .int .Lshl_15_bwd - .Lshl_table_bwd + movups %xmm11, 16(%rcx) + movups %xmm10, 32(%rcx) + movups %xmm9, 48(%rcx) + movups %xmm8, 64(%rcx) + movups %xmm7, 80(%rcx) + ret + ; .p2align 5; .Llarge_loop_fwd_0xf: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0xf, %xmm4, %xmm5; palignr $0xf, %xmm3, %xmm4; palignr $0xf, %xmm2, %xmm3; palignr $0xf, %xmm0, %xmm2; palignr $0xf, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0xf; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0xe: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0xe, %xmm4, %xmm5; palignr $0xe, %xmm3, %xmm4; palignr $0xe, %xmm2, %xmm3; palignr $0xe, %xmm0, %xmm2; palignr $0xe, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0xe; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0xd: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0xd, %xmm4, %xmm5; palignr $0xd, %xmm3, %xmm4; palignr $0xd, %xmm2, %xmm3; palignr $0xd, %xmm0, %xmm2; palignr $0xd, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0xd; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0xc: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0xc, %xmm4, %xmm5; palignr $0xc, %xmm3, %xmm4; palignr $0xc, %xmm2, %xmm3; palignr $0xc, %xmm0, %xmm2; palignr $0xc, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0xc; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0xb: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0xb, %xmm4, %xmm5; palignr $0xb, %xmm3, %xmm4; palignr $0xb, %xmm2, %xmm3; palignr $0xb, %xmm0, %xmm2; palignr $0xb, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0xb; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0xa: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0xa, %xmm4, %xmm5; palignr $0xa, %xmm3, %xmm4; palignr $0xa, %xmm2, %xmm3; palignr $0xa, %xmm0, %xmm2; palignr $0xa, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0xa; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x9: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x9, %xmm4, %xmm5; palignr $0x9, %xmm3, %xmm4; palignr $0x9, %xmm2, %xmm3; palignr $0x9, %xmm0, %xmm2; palignr $0x9, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x9; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x8: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x8, %xmm4, %xmm5; palignr $0x8, %xmm3, %xmm4; palignr $0x8, %xmm2, %xmm3; palignr $0x8, %xmm0, %xmm2; palignr $0x8, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x8; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x7: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x7, %xmm4, %xmm5; palignr $0x7, %xmm3, %xmm4; palignr $0x7, %xmm2, %xmm3; palignr $0x7, %xmm0, %xmm2; palignr $0x7, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x7; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x6: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x6, %xmm4, %xmm5; palignr $0x6, %xmm3, %xmm4; palignr $0x6, %xmm2, %xmm3; palignr $0x6, %xmm0, %xmm2; palignr $0x6, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x6; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x5: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x5, %xmm4, %xmm5; palignr $0x5, %xmm3, %xmm4; palignr $0x5, %xmm2, %xmm3; palignr $0x5, %xmm0, %xmm2; palignr $0x5, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x5; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x4: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x4, %xmm4, %xmm5; palignr $0x4, %xmm3, %xmm4; palignr $0x4, %xmm2, %xmm3; palignr $0x4, %xmm0, %xmm2; palignr $0x4, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x4; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x3: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x3, %xmm4, %xmm5; palignr $0x3, %xmm3, %xmm4; palignr $0x3, %xmm2, %xmm3; palignr $0x3, %xmm0, %xmm2; palignr $0x3, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x3; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x2: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x2, %xmm4, %xmm5; palignr $0x2, %xmm3, %xmm4; palignr $0x2, %xmm2, %xmm3; palignr $0x2, %xmm0, %xmm2; palignr $0x2, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x2; jmp .Lend_large_loop_fwd; + ; .p2align 5; .Llarge_loop_fwd_0x1: movaps 16(%rsi), %xmm0; movaps 32(%rsi), %xmm2; movaps 48(%rsi), %xmm3; movaps 64(%rsi), %xmm4; movaps 80(%rsi), %xmm5; movaps %xmm5, %xmm6; palignr $0x1, %xmm4, %xmm5; palignr $0x1, %xmm3, %xmm4; palignr $0x1, %xmm2, %xmm3; palignr $0x1, %xmm0, %xmm2; palignr $0x1, %xmm1, %xmm0; movaps %xmm6, %xmm1; movntps %xmm0, 16(%rdi); movntps %xmm2, 32(%rdi); movntps %xmm3, 48(%rdi); movntps %xmm4, 64(%rdi); movntps %xmm5, 80(%rdi); addq $80, %rdi; addq $80, %rsi; cmpq %rdi, %rcx; ja .Llarge_loop_fwd_0x1; jmp .Lend_large_loop_fwd; + + .p2align 6 +.Lloop_bkwd_start: +.Lloop_bkwd_0x0: + movaps 32(%rsi), %xmm1 + movaps 16(%rsi), %xmm2 + movaps 0(%rsi), %xmm3 + movaps %xmm1, 32(%rdi) + movaps %xmm2, 16(%rdi) + movaps %xmm3, 0(%rdi) + subq $48, %rdi + subq $48, %rsi + cmpq %rdi, %r8 + jb .Lloop_bkwd_0x0 +.Lend_loop_bkwd: + movups %xmm7, -16(%r8, %rdx) + movups %xmm0, 0(%r8) + movups %xmm4, 16(%r8) + movups %xmm5, 32(%r8) + + ret + ; .p2align 6; .Lloop_bkwd_0xf: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0xf, %xmm1, %xmm6; palignr $0xf, %xmm2, %xmm1; palignr $0xf, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0xf; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0xe: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0xe, %xmm1, %xmm6; palignr $0xe, %xmm2, %xmm1; palignr $0xe, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0xe; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0xd: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0xd, %xmm1, %xmm6; palignr $0xd, %xmm2, %xmm1; palignr $0xd, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0xd; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0xc: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0xc, %xmm1, %xmm6; palignr $0xc, %xmm2, %xmm1; palignr $0xc, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0xc; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0xb: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0xb, %xmm1, %xmm6; palignr $0xb, %xmm2, %xmm1; palignr $0xb, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0xb; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0xa: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0xa, %xmm1, %xmm6; palignr $0xa, %xmm2, %xmm1; palignr $0xa, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0xa; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x9: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x9, %xmm1, %xmm6; palignr $0x9, %xmm2, %xmm1; palignr $0x9, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x9; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x8: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x8, %xmm1, %xmm6; palignr $0x8, %xmm2, %xmm1; palignr $0x8, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x8; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x7: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x7, %xmm1, %xmm6; palignr $0x7, %xmm2, %xmm1; palignr $0x7, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x7; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x6: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x6, %xmm1, %xmm6; palignr $0x6, %xmm2, %xmm1; palignr $0x6, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x6; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x5: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x5, %xmm1, %xmm6; palignr $0x5, %xmm2, %xmm1; palignr $0x5, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x5; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x4: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x4, %xmm1, %xmm6; palignr $0x4, %xmm2, %xmm1; palignr $0x4, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x4; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x3: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x3, %xmm1, %xmm6; palignr $0x3, %xmm2, %xmm1; palignr $0x3, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x3; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x2: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x2, %xmm1, %xmm6; palignr $0x2, %xmm2, %xmm1; palignr $0x2, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x2; jmp .Lend_loop_bkwd; + ; .p2align 6; .Lloop_bkwd_0x1: movaps 32(%rsi), %xmm1; movaps 16(%rsi), %xmm2; movaps 0(%rsi), %xmm3; palignr $0x1, %xmm1, %xmm6; palignr $0x1, %xmm2, %xmm1; palignr $0x1, %xmm3, %xmm2; movaps %xmm6, 32(%rdi); movaps %xmm1, 16(%rdi); movaps %xmm2, 0(%rdi); subq $48, %rdi; subq $48, %rsi; movaps %xmm3, %xmm6; cmpq %rdi, %r8; jb .Lloop_bkwd_0x1; jmp .Lend_loop_bkwd; + +.globl __memcpy_ssse3 +.set __memcpy_ssse3, __memmove_ssse3 diff --git a/include/libmemcpy.h b/include/libmemcpy.h index 8c566e5..d732e2c 100644 --- a/include/libmemcpy.h +++ b/include/libmemcpy.h @@ -45,7 +45,6 @@ typedef void *memcpy_t(void *dst, const void *src, size_t len); #define memcpy_sse2_unaligned __memcpy_sse2_unaligned #define memcpy_sse2_unaligned_erms __memcpy_sse2_unaligned_erms #define memcpy_ssse3 __memcpy_ssse3 -#define memcpy_ssse3_back __memcpy_ssse3_back #define memmove_avx512_no_vzeroupper __memmove_avx512_no_vzeroupper #define memmove_avx512_unaligned __memmove_avx512_unaligned #define memmove_avx512_unaligned_erms __memmove_avx512_unaligned_erms @@ -59,7 +58,6 @@ typedef void *memcpy_t(void *dst, const void *src, size_t len); #define memmove_sse2_unaligned __memmove_sse2_unaligned #define memmove_sse2_unaligned_erms __memmove_sse2_unaligned_erms #define memmove_ssse3 __memmove_ssse3 -#define memmove_ssse3_back __memmove_ssse3_back #define mempcpy_avx512_no_vzeroupper __mempcpy_avx512_no_vzeroupper #define mempcpy_avx512_unaligned __mempcpy_avx512_unaligned #define mempcpy_avx512_unaligned_erms __mempcpy_avx512_unaligned_erms @@ -73,7 +71,6 @@ typedef void *memcpy_t(void *dst, const void *src, size_t len); #define mempcpy_sse2_unaligned __mempcpy_sse2_unaligned #define mempcpy_sse2_unaligned_erms __mempcpy_sse2_unaligned_erms #define mempcpy_ssse3 __mempcpy_ssse3 -#define mempcpy_ssse3_back __mempcpy_ssse3_back #endif memcpy_t memcpy_avx512_no_vzeroupper; @@ -89,7 +86,6 @@ memcpy_t memcpy_evex_unaligned_erms; memcpy_t memcpy_sse2_unaligned; memcpy_t memcpy_sse2_unaligned_erms; memcpy_t memcpy_ssse3; -memcpy_t memcpy_ssse3_back; memcpy_t memmove_avx512_no_vzeroupper; memcpy_t memmove_avx512_unaligned; memcpy_t memmove_avx512_unaligned_erms; @@ -103,7 +99,6 @@ memcpy_t memmove_evex_unaligned_erms; memcpy_t memmove_sse2_unaligned; memcpy_t memmove_sse2_unaligned_erms; memcpy_t memmove_ssse3; -memcpy_t memmove_ssse3_back; memcpy_t mempcpy_avx512_no_vzeroupper; memcpy_t mempcpy_avx512_unaligned; memcpy_t mempcpy_avx512_unaligned_erms; @@ -117,7 +112,6 @@ memcpy_t mempcpy_evex_unaligned_erms; memcpy_t mempcpy_sse2_unaligned; memcpy_t mempcpy_sse2_unaligned_erms; memcpy_t mempcpy_ssse3; -memcpy_t mempcpy_ssse3_back; extern memcpy_t *memcpy_fast; extern memcpy_t *memmove_fast; diff --git a/src/cpu.c b/src/cpu.c index 499685f..631a6b4 100644 --- a/src/cpu.c +++ b/src/cpu.c @@ -306,9 +306,9 @@ memcpy_t *memcpy_fast; memcpy_t *memmove_fast; memcpy_t *mempcpy_fast; -static memcpy_t *memcpy_available[15]; -static memcpy_t *memmove_available[15]; -static memcpy_t *mempcpy_available[15]; +static memcpy_t *memcpy_available[14]; +static memcpy_t *memmove_available[14]; +static memcpy_t *mempcpy_available[14]; static int memcpy_available_count; static int memmove_available_count; diff --git a/src/mingw-shim.s b/src/mingw-shim.s index 4b206cf..a84f122 100644 --- a/src/mingw-shim.s +++ b/src/mingw-shim.s @@ -151,17 +151,7 @@ memcpy_erms: movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx - subq $72, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm12, 48(%rsp) call __memcpy_erms - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm12 - addq $72, %rsp popq %rsi popq %rdi ret @@ -241,35 +231,21 @@ memcpy_ssse3: movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx - subq $40, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - call __memcpy_ssse3 - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - addq $40, %rsp - popq %rsi - popq %rdi - ret - - .global memcpy_ssse3_back -memcpy_ssse3_back: - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - subq $72, %rsp + subq $104, %rsp movdqa %xmm6, 0(%rsp) movdqa %xmm7, 16(%rsp) movdqa %xmm8, 32(%rsp) movdqa %xmm9, 48(%rsp) - call __memcpy_ssse3_back + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + call __memcpy_ssse3 movdqa 0(%rsp), %xmm6 movdqa 16(%rsp), %xmm7 movdqa 32(%rsp), %xmm8 movdqa 48(%rsp), %xmm9 - addq $72, %rsp + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $104, %rsp popq %rsi popq %rdi ret @@ -427,17 +403,7 @@ memmove_erms: movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx - subq $72, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm12, 48(%rsp) call __memmove_erms - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm12 - addq $72, %rsp popq %rsi popq %rdi ret @@ -517,35 +483,21 @@ memmove_ssse3: movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx - subq $40, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - call __memmove_ssse3 - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - addq $40, %rsp - popq %rsi - popq %rdi - ret - - .global memmove_ssse3_back -memmove_ssse3_back: - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - subq $72, %rsp + subq $104, %rsp movdqa %xmm6, 0(%rsp) movdqa %xmm7, 16(%rsp) movdqa %xmm8, 32(%rsp) movdqa %xmm9, 48(%rsp) - call __memmove_ssse3_back + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + call __memmove_ssse3 movdqa 0(%rsp), %xmm6 movdqa 16(%rsp), %xmm7 movdqa 32(%rsp), %xmm8 movdqa 48(%rsp), %xmm9 - addq $72, %rsp + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $104, %rsp popq %rsi popq %rdi ret @@ -703,17 +655,7 @@ mempcpy_erms: movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx - subq $72, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm12, 48(%rsp) call __mempcpy_erms - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm12 - addq $72, %rsp popq %rsi popq %rdi ret @@ -793,35 +735,21 @@ mempcpy_ssse3: movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx - subq $40, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - call __mempcpy_ssse3 - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - addq $40, %rsp - popq %rsi - popq %rdi - ret - - .global mempcpy_ssse3_back -mempcpy_ssse3_back: - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - subq $72, %rsp + subq $104, %rsp movdqa %xmm6, 0(%rsp) movdqa %xmm7, 16(%rsp) movdqa %xmm8, 32(%rsp) movdqa %xmm9, 48(%rsp) - call __mempcpy_ssse3_back + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + call __mempcpy_ssse3 movdqa 0(%rsp), %xmm6 movdqa 16(%rsp), %xmm7 movdqa 32(%rsp), %xmm8 movdqa 48(%rsp), %xmm9 - addq $72, %rsp + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $104, %rsp popq %rsi popq %rdi ret diff --git a/src/names-memcpy.c b/src/names-memcpy.c index 4a32b4d..254cb81 100644 --- a/src/names-memcpy.c +++ b/src/names-memcpy.c @@ -29,8 +29,6 @@ const char *libmemcpy_memcpy_name(memcpy_t *func) { return "memcpy_sse2_unaligned_erms"; if (func == memcpy_ssse3) return "memcpy_ssse3"; - if (func == memcpy_ssse3_back) - return "memcpy_ssse3_back"; return NULL; } @@ -61,8 +59,6 @@ const char *libmemcpy_memmove_name(memcpy_t *func) { return "memmove_sse2_unaligned_erms"; if (func == memmove_ssse3) return "memmove_ssse3"; - if (func == memmove_ssse3_back) - return "memmove_ssse3_back"; return NULL; } @@ -93,7 +89,5 @@ const char *libmemcpy_mempcpy_name(memcpy_t *func) { return "mempcpy_sse2_unaligned_erms"; if (func == mempcpy_ssse3) return "mempcpy_ssse3"; - if (func == mempcpy_ssse3_back) - return "mempcpy_ssse3_back"; return NULL; }