diff --git a/Cargo.toml b/Cargo.toml
index e2835e252b..e095f9efc5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -110,10 +110,12 @@ include = [
     "examples/**/*.rs",
     "include/ring-core/aes.h",
     "include/ring-core/arm_arch.h",
+    "include/ring-core/asm_base.h",
     "include/ring-core/base.h",
     "include/ring-core/check.h",
     "include/ring-core/mem.h",
     "include/ring-core/poly1305.h",
+    "include/ring-core/target.h",
     "include/ring-core/type_check.h",
     "src/**/*.rs",
     "src/aead/poly1305_test.txt",
diff --git a/crypto/curve25519/asm/x25519-asm-arm.S b/crypto/curve25519/asm/x25519-asm-arm.S
index 04d0362bdd..8a51bb9bdf 100644
--- a/crypto/curve25519/asm/x25519-asm-arm.S
+++ b/crypto/curve25519/asm/x25519-asm-arm.S
@@ -17,15 +17,9 @@
  * domain licensed but the standard ISC license is included above to keep
  * licensing simple. */
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
+#include <ring-core/asm_base.h>
 
-#if !defined(OPENSSL_NO_ASM) && defined(__ARMEL__) && defined(__ELF__)
-
-#include "ring_core_generated/prefix_symbols_asm.h"
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
 
 .fpu neon
 .text
@@ -2127,8 +2121,4 @@
 mov sp,r12
 vpop {q4,q5,q6,q7}
 bx lr
-#endif /* !OPENSSL_NO_ASM && __ARMEL__ && __ELF__ */
-
-#if defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
+#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */
diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c
index e8178e0aea..1ab754d3af 100644
--- a/crypto/curve25519/curve25519.c
+++ b/crypto/curve25519/curve25519.c
@@ -778,6 +778,18 @@ static void table_select(ge_precomp *t, const int pos, const signed char b) {
 // Preconditions:
 //   a[31] <= 127
 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
+#if defined(BORINGSSL_FE25519_ADX)
+  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
+      CRYPTO_is_ADX_capable()) {
+    uint8_t t[4][32];
+    x25519_ge_scalarmult_base_adx(t, a);
+    fiat_25519_from_bytes(h->X.v, t[0]);
+    fiat_25519_from_bytes(h->Y.v, t[1]);
+    fiat_25519_from_bytes(h->Z.v, t[2]);
+    fiat_25519_from_bytes(h->T.v, t[3]);
+    return;
+  }
+#endif
   signed char e[64];
   signed char carry;
   ge_p1p1 r;
diff --git a/crypto/curve25519/curve25519_64_adx.c b/crypto/curve25519/curve25519_64_adx.c
new file mode 100644
index 0000000000..2768989643
--- /dev/null
+++ b/crypto/curve25519/curve25519_64_adx.c
@@ -0,0 +1,18 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include "internal.h"
+#if defined(BORINGSSL_FE25519_ADX)
+#include "../../third_party/fiat/curve25519_64_adx.h"
+#endif
diff --git a/crypto/internal.h b/crypto/internal.h
index fe182bd4ce..6a6351710d 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -223,22 +223,6 @@ static inline crypto_word_t value_barrier_w(crypto_word_t a) {
   return a;
 }
 
-// value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|.
-static inline uint32_t value_barrier_u32(uint32_t a) {
-#if defined(__GNUC__) || defined(__clang__)
-  __asm__("" : "+r"(a) : /* no inputs */);
-#endif
-  return a;
-}
-
-// value_barrier_u64 behaves like |value_barrier_w| but takes a |uint64_t|.
-static inline uint64_t value_barrier_u64(uint64_t a) {
-#if defined(__GNUC__) || defined(__clang__)
-  __asm__("" : "+r"(a) : /* no inputs */);
-#endif
-  return a;
-}
-
 // |value_barrier_u8| could be defined as above, but compilers other than
 // clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM.
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
index d15a0cb108..aa2f20f348 100755
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -153,9 +153,9 @@ sub expand_line {
 
 my ($arch_defines, $target_defines);
 if ($flavour =~ /32/) {
-    $arch_defines = "defined(__ARMEL__)";
+    $arch_defines = "defined(OPENSSL_ARM)";
 } elsif ($flavour =~ /64/) {
-    $arch_defines = "defined(__AARCH64EL__)";
+    $arch_defines = "defined(OPENSSL_AARCH64)";
 } else {
     die "unknown architecture: $flavour";
 }
@@ -177,18 +177,11 @@ sub expand_line {
 // This file is generated from a similarly-named Perl script in the BoringSSL
 // source tree. Do not edit by hand.
 
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
+#include <ring-core/asm_base.h>
 
 #if !defined(OPENSSL_NO_ASM) && $arch_defines && $target_defines
 ___
 
-print "#include \"ring_core_generated/prefix_symbols_asm.h\"\n";
-
 while(my $line=<>) {
 
     if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; }
@@ -258,10 +251,6 @@ sub expand_line {
 print <<___;
 #endif  // !OPENSSL_NO_ASM && $arch_defines && $target_defines
-#if defined(__ELF__)
-// See https://www.airs.com/blog/archives/518.
-.section .note.GNU-stack,"",\%progbits
-#endif
 ___
 
 close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 508cf4986a..044a379da9 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -1520,14 +1520,9 @@ sub rxb {
 	die "unknown target: $flavour";
     }
     print <<___;
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
-
-#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) && $target
-#include "ring_core_generated/prefix_symbols_asm.h"
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && $target
 ___
 }
@@ -1623,13 +1618,7 @@ sub process_line {
     if ($masm) {
 	print "END\n";
     } elsif ($gas) {
-	print <<___;
-#endif
-#if defined(__ELF__)
-// See https://www.airs.com/blog/archives/518.
-.section .note.GNU-stack,"",\%progbits
-#endif
-___
+	print "#endif\n";
     } elsif ($nasm) {
 	print <<___;
 \%else
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index f0abfb5181..59fc975f24 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -305,22 +305,13 @@ sub ::asm_finish
     }
     print <<___;
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
-
-#if !defined(OPENSSL_NO_ASM) && defined(__i386__) && $target
-#include "ring_core_generated/prefix_symbols_asm.h"
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target
 ___
     print @out;
     print <<___;
-#endif  // !defined(OPENSSL_NO_ASM) && defined(__i386__) && $target
-#if defined(__ELF__)
-// See https://www.airs.com/blog/archives/518.
-.section .note.GNU-stack,"",\%progbits
-#endif
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target
 ___
 }
 }
diff --git a/crypto/poly1305/poly1305_arm_asm.S b/crypto/poly1305/poly1305_arm_asm.S
index 93f46e8111..df464d068d 100644
--- a/crypto/poly1305/poly1305_arm_asm.S
+++ b/crypto/poly1305/poly1305_arm_asm.S
@@ -1,15 +1,9 @@
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
+#include <ring-core/asm_base.h>
 
-#if defined(__ARMEL__) && !defined(OPENSSL_NO_ASM) && defined(__ELF__)
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
 
 #pragma GCC diagnostic ignored "-Wlanguage-extension-token"
 
-#include "ring_core_generated/prefix_symbols_asm.h"
-
 # This implementation was taken from the public domain, neon2 version in
 # SUPERCOP by D. J. Bernstein and Peter Schwabe.
 
@@ -2022,8 +2016,4 @@
 vst1.8 d4,[r0,: 64]
 add sp,sp,#0
 bx lr
-#endif /* __ARMEL__ && !OPENSSL_NO_ASM && __ELF__ */
-
-#if defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
+#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */
diff --git a/include/ring-core/arm_arch.h b/include/ring-core/arm_arch.h
index 77229ddc87..2fc0fc0421 100644
--- a/include/ring-core/arm_arch.h
+++ b/include/ring-core/arm_arch.h
@@ -53,12 +53,13 @@
 #ifndef OPENSSL_HEADER_ARM_ARCH_H
 #define OPENSSL_HEADER_ARM_ARCH_H
 
+#include <ring-core/target.h>
+
 // arm_arch.h contains symbols used by ARM assembly, and the C code that calls
 // it. It is included as a public header to simplify the build, but is not
 // intended for external use.
 
-#if defined(__ARMEL__) || defined(_M_ARM) || defined(__AARCH64EL__) || \
-    defined(_M_ARM64)
+#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 
 // ARMV7_NEON is true when a NEON unit is present in the current CPU.
 #define ARMV7_NEON (1 << 0)
@@ -91,124 +92,8 @@
 // will be included.
 #define __ARM_MAX_ARCH__ 8
 
-// Support macros for
-//   - Armv8.3-A Pointer Authentication and
-//   - Armv8.5-A Branch Target Identification
-// features which require emitting a .note.gnu.property section with the
-// appropriate architecture-dependent feature bits set.
-//
-// |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
-// PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
-// used immediately before saving the LR register (x30) to the stack.
-// |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
-// it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
-// with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
-// have the same value at the two points. 
For example: -// -// .global f -// f: -// AARCH64_SIGN_LINK_REGISTER -// stp x29, x30, [sp, #-96]! -// mov x29, sp -// ... -// ldp x29, x30, [sp], #96 -// AARCH64_VALIDATE_LINK_REGISTER -// ret -// -// |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or -// |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an -// indirect call target. In particular, all symbols exported from a file must -// begin with one of these macros. For example, a leaf function that does not -// save LR can instead use |AARCH64_VALID_CALL_TARGET|: -// -// .globl return_zero -// return_zero: -// AARCH64_VALID_CALL_TARGET -// mov x0, #0 -// ret -// -// A non-leaf function which does not immediately save LR may need both macros -// because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function -// may jump to an alternate implementation before setting up the stack: -// -// .globl with_early_jump -// with_early_jump: -// AARCH64_VALID_CALL_TARGET -// cmp x0, #128 -// b.lt .Lwith_early_jump_128 -// AARCH64_SIGN_LINK_REGISTER -// stp x29, x30, [sp, #-96]! -// mov x29, sp -// ... -// ldp x29, x30, [sp], #96 -// AARCH64_VALIDATE_LINK_REGISTER -// ret -// -// .Lwith_early_jump_128: -// ... -// ret -// -// These annotations are only required with indirect calls. Private symbols that -// are only the target of direct calls do not require annotations. Also note -// that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not -// indirect jumps (BR). Indirect jumps in assembly are currently not supported -// and would require a macro for BTI 'j'. -// -// Although not necessary, it is safe to use these macros in 32-bit ARM -// assembly. This may be used to simplify dual 32-bit and 64-bit files. -// -// References: -// - "ELF for the Arm® 64-bit Architecture" -// https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst -// - "Providing protection for complex software" -// https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software - -#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 -#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification -#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' -#else -#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification -#define AARCH64_VALID_CALL_TARGET -#endif - -#if defined(__ARM_FEATURE_PAC_DEFAULT) && \ - (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 // Signed with A-key -#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ - (1 << 1) // Has Pointer Authentication -#define AARCH64_SIGN_LINK_REGISTER hint #25 // PACIASP -#define AARCH64_VALIDATE_LINK_REGISTER hint #29 // AUTIASP -#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ - (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 // Signed with B-key -#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ - (1 << 1) // Has Pointer Authentication -#define AARCH64_SIGN_LINK_REGISTER hint #27 // PACIBSP -#define AARCH64_VALIDATE_LINK_REGISTER hint #31 // AUTIBSP -#else -#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 // No Pointer Authentication -#if GNU_PROPERTY_AARCH64_BTI != 0 -#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET -#else -#define AARCH64_SIGN_LINK_REGISTER -#endif -#define AARCH64_VALIDATE_LINK_REGISTER -#endif - -#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 -.pushsection .note.gnu.property, "a"; -.balign 8; -.long 4; -.long 0x10; -.long 0x5; -.asciz "GNU"; -.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ -.long 4; -.long 
(GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
-.long 0;
-.popsection;
-#endif
-
 #endif  // __ASSEMBLER__
 
-#endif  // __ARMEL__ || _M_ARM || __AARCH64EL__ || _M_ARM64
+#endif  // ARM || AARCH64
 
 #endif  // OPENSSL_HEADER_ARM_ARCH_H
diff --git a/include/ring-core/asm_base.h b/include/ring-core/asm_base.h
new file mode 100644
index 0000000000..c905e998c1
--- /dev/null
+++ b/include/ring-core/asm_base.h
@@ -0,0 +1,186 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_ASM_BASE_H
+#define OPENSSL_HEADER_ASM_BASE_H
+
+#include <ring-core/target.h>
+
+
+// This header contains symbols and common sections used by assembly files. It
+// is included as a public header to simplify the build, but is not intended for
+// external use.
+//
+// Every assembly file must include this header. Some linker features require
+// all object files to be tagged with some section metadata. This header file,
+// when included in assembly, adds that metadata. It also makes defines like
+// |OPENSSL_X86_64| available and includes the prefixing macros.
+//
+// Including this header in an assembly file implies:
+//
+//   - The file does not require an executable stack.
+//
+//   - The file, on aarch64, uses the macros defined below to be compatible with
+//     BTI and PAC.
+
+#if defined(__ASSEMBLER__)
+
+#include <ring_core_generated/prefix_symbols_asm.h>
+
+#if defined(__ELF__)
+// Every ELF object file, even empty ones, should disable executable stacks. See
+// https://www.airs.com/blog/archives/518.
+.pushsection .note.GNU-stack, "", %progbits
+.popsection
+#endif
+
+#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
+
+// We require the ARM assembler provide |__ARM_ARCH| from Arm C Language
+// Extensions (ACLE). This is supported in GCC 4.8+ and Clang 3.2+. MSVC does
+// not implement ACLE, but we require Clang's assembler on Windows.
+#if !defined(__ARM_ARCH)
+#error "ARM assembler must define __ARM_ARCH"
+#endif
+
+// __ARM_ARCH__ is used by OpenSSL assembly to determine the minimum target ARM
+// version.
+//
+// TODO(davidben): Switch the assembly to use |__ARM_ARCH| directly.
+#define __ARM_ARCH__ __ARM_ARCH
+
+// Even when building for 32-bit ARM, support for aarch64 crypto instructions
+// will be included.
+#define __ARM_MAX_ARCH__ 8
+
+// Support macros for
+//   - Armv8.3-A Pointer Authentication and
+//   - Armv8.5-A Branch Target Identification
+// features which require emitting a .note.gnu.property section with the
+// appropriate architecture-dependent feature bits set.
+//
+// |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+// PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+// used immediately before saving the LR register (x30) to the stack.
+// |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+// it. 
Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone +// with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also +// have the same value at the two points. For example: +// +// .global f +// f: +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or +// |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an +// indirect call target. In particular, all symbols exported from a file must +// begin with one of these macros. For example, a leaf function that does not +// save LR can instead use |AARCH64_VALID_CALL_TARGET|: +// +// .globl return_zero +// return_zero: +// AARCH64_VALID_CALL_TARGET +// mov x0, #0 +// ret +// +// A non-leaf function which does not immediately save LR may need both macros +// because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function +// may jump to an alternate implementation before setting up the stack: +// +// .globl with_early_jump +// with_early_jump: +// AARCH64_VALID_CALL_TARGET +// cmp x0, #128 +// b.lt .Lwith_early_jump_128 +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// .Lwith_early_jump_128: +// ... +// ret +// +// These annotations are only required with indirect calls. Private symbols that +// are only the target of direct calls do not require annotations. Also note +// that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not +// indirect jumps (BR). Indirect jumps in assembly are currently not supported +// and would require a macro for BTI 'j'. +// +// Although not necessary, it is safe to use these macros in 32-bit ARM +// assembly. This may be used to simplify dual 32-bit and 64-bit files. 
+//
+// References:
+// - "ELF for the Arm® 64-bit Architecture"
+//   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+// - "Providing protection for complex software"
+//   https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0)   // Has Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET hint #34  // BTI 'c'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0  // No Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 1) == 1  // Signed with A-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)                                       // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #25      // PACIASP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #29  // AUTIASP
+#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 2) == 2  // Signed with B-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)                                       // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #27      // PACIBSP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #31  // AUTIBSP
+#else
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0  // No Pointer Authentication
+#if GNU_PROPERTY_AARCH64_BTI != 0
+#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
+#else
+#define AARCH64_SIGN_LINK_REGISTER
+#endif
+#define AARCH64_VALIDATE_LINK_REGISTER
+#endif
+
+#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
+.pushsection .note.gnu.property, "a";
+.balign 8;
+.long 4;
+.long 0x10;
+.long 0x5;
+.asciz "GNU";
+.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+.long 4;
+.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
+.long 0;
+.popsection;
+#endif
+#endif  // ARM || AARCH64
+
+#endif  // __ASSEMBLER__
+
+#endif  // OPENSSL_HEADER_ASM_BASE_H
diff --git a/include/ring-core/base.h b/include/ring-core/base.h
index f1a027d1a4..938c5b8c57 100644
--- a/include/ring-core/base.h
+++ b/include/ring-core/base.h
@@ -56,10 +56,6 @@
 
 // This file should be the first included by all BoringSSL headers.
 
-#include <stddef.h>
-
-#include <stdint.h>
-
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(push, 3)
 #endif
@@ -71,40 +67,25 @@
 #pragma warning(pop)
 #endif
 
-#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64)
-#define OPENSSL_64_BIT
-#define OPENSSL_X86_64
-#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86)
-#define OPENSSL_32_BIT
-#define OPENSSL_X86
-#elif defined(__AARCH64EL__) || defined(_M_ARM64)
-#define OPENSSL_64_BIT
-#define OPENSSL_AARCH64
-#elif defined(__ARMEL__) || defined(_M_ARM)
-#define OPENSSL_32_BIT
-#define OPENSSL_ARM
-#elif defined(__MIPSEL__) && !defined(__LP64__)
-#define OPENSSL_32_BIT
-#define OPENSSL_MIPS
-#elif defined(__MIPSEL__) && defined(__LP64__)
-#define OPENSSL_64_BIT
-#define OPENSSL_MIPS64
-#elif defined(__wasm__)
-#define OPENSSL_32_BIT
-#else
-// Note BoringSSL only supports standard 32-bit and 64-bit two's-complement,
-// little-endian architectures. Functions will not produce the correct answer
-// on other systems. Run the crypto_test binary, notably
-// crypto/compiler_test.cc, before adding a new architecture. 
-#error "Unknown target CPU"
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
 #endif
 
+#include <ring-core/target.h>  // IWYU pragma: export
+
+#include <stddef.h>
+
+#include <stdint.h>
+
 #if defined(__APPLE__)
-#define OPENSSL_APPLE
+// Note |TARGET_OS_MAC| is set for all Apple OS variants. |TARGET_OS_OSX|
+// targets macOS specifically.
+#if defined(TARGET_OS_OSX) && TARGET_OS_OSX
+#define OPENSSL_MACOS
+#endif
+#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
+#define OPENSSL_IOS
 #endif
-
-#if defined(_WIN32)
-#define OPENSSL_WINDOWS
 #endif
 
 // *ring* doesn't support the `BORINGSSL_SHARED_LIBRARY` configuration, so
diff --git a/include/ring-core/target.h b/include/ring-core/target.h
new file mode 100644
index 0000000000..0213d5c637
--- /dev/null
+++ b/include/ring-core/target.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_TARGET_H
+#define OPENSSL_HEADER_TARGET_H
+
+// Preprocessor symbols that define the target platform.
+//
+// This file may be included in C, C++, and assembler and must be compatible
+// with each environment. It is separated out only to share code between
+// <ring-core/base.h> and <ring-core/asm_base.h>. Prefer to include those
+// headers instead.
+
+#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64)
+#define OPENSSL_64_BIT
+#define OPENSSL_X86_64
+#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#define OPENSSL_32_BIT
+#define OPENSSL_X86
+#elif defined(__AARCH64EL__) || defined(_M_ARM64)
+#define OPENSSL_64_BIT
+#define OPENSSL_AARCH64
+#elif defined(__ARMEL__) || defined(_M_ARM)
+#define OPENSSL_32_BIT
+#define OPENSSL_ARM
+#elif defined(__MIPSEL__) && !defined(__LP64__)
+#define OPENSSL_32_BIT
+#define OPENSSL_MIPS
+#elif defined(__MIPSEL__) && defined(__LP64__)
+#define OPENSSL_64_BIT
+#define OPENSSL_MIPS64
+#elif defined(__wasm__)
+#define OPENSSL_32_BIT
+#else
+// Note BoringSSL only supports standard 32-bit and 64-bit two's-complement,
+// little-endian architectures. Functions will not produce the correct answer
+// on other systems. Run the crypto_test binary, notably
+// crypto/compiler_test.cc, before adding a new architecture.
+#error "Unknown target CPU"
+#endif
+
+#if defined(__APPLE__)
+#define OPENSSL_APPLE
+#endif
+
+#if defined(_WIN32)
+#define OPENSSL_WINDOWS
+#endif
+
+// Trusty isn't Linux but currently defines __linux__. As a workaround, we
+// exclude it here.
+// TODO(b/169780122): Remove this workaround once Trusty no longer defines it.
+#if defined(__linux__) && !defined(__TRUSTY__) +#define OPENSSL_LINUX +#endif + +#if defined(__Fuchsia__) +#define OPENSSL_FUCHSIA +#endif + +#if defined(__TRUSTY__) +#define OPENSSL_TRUSTY +#define OPENSSL_NO_POSIX_IO +#define OPENSSL_NO_SOCK +#define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED +#endif + +#if defined(OPENSSL_NANOLIBC) +#define OPENSSL_NO_POSIX_IO +#define OPENSSL_NO_SOCK +#define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED +#endif + +#if defined(__ANDROID_API__) +#define OPENSSL_ANDROID +#endif + +#if defined(__FreeBSD__) +#define OPENSSL_FREEBSD +#endif + +#if defined(__OpenBSD__) +#define OPENSSL_OPENBSD +#endif + +// BoringSSL requires platform's locking APIs to make internal global state +// thread-safe, including the PRNG. On some single-threaded embedded platforms, +// locking APIs may not exist, so this dependency may be disabled with the +// following build flag. +// +// IMPORTANT: Doing so means the consumer promises the library will never be +// used in any multi-threaded context. It causes BoringSSL to be globally +// thread-unsafe. Setting it inappropriately will subtly and unpredictably +// corrupt memory and leak secret keys. +// +// Do not set this flag on any platform where threads are possible. BoringSSL +// maintainers will not provide support for any consumers that do so. Changes +// which break such unsupported configurations will not be reverted. +#if !defined(OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED) +#define OPENSSL_THREADS +#endif + +#if defined(BORINGSSL_UNSAFE_FUZZER_MODE) && \ + !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) +#define BORINGSSL_UNSAFE_DETERMINISTIC_MODE +#endif + +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define OPENSSL_ASAN +#endif +#if __has_feature(thread_sanitizer) +#define OPENSSL_TSAN +#endif +#if __has_feature(memory_sanitizer) +#define OPENSSL_MSAN +#define OPENSSL_ASM_INCOMPATIBLE +#endif +#endif + +#if defined(OPENSSL_ASM_INCOMPATIBLE) +#undef OPENSSL_ASM_INCOMPATIBLE +#if !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif // OPENSSL_ASM_INCOMPATIBLE + +#endif // OPENSSL_HEADER_TARGET_H diff --git a/include/ring-core/type_check.h b/include/ring-core/type_check.h index d7e0393451..67df7bc687 100644 --- a/include/ring-core/type_check.h +++ b/include/ring-core/type_check.h @@ -71,5 +71,4 @@ #define OPENSSL_STATIC_ASSERT(cond, msg) _Static_assert(cond, msg) #endif - #endif // OPENSSL_HEADER_TYPE_CHECK_H diff --git a/third_party/fiat/README.md b/third_party/fiat/README.md index 56accd45fa..9c1fc870bb 100644 --- a/third_party/fiat/README.md +++ b/third_party/fiat/README.md @@ -1,8 +1,23 @@ -# Fiat +# Fiat Cryptography -This directory contains code generated by -[Fiat](https://github.com/mit-plv/fiat-crypto) and thus these files are -licensed under the MIT license. (See LICENSE file.) +The files in this directory are generated using [Fiat +Cryptography](https://github.com/mit-plv/fiat-crypto) from the associated +library of arithmetic-implementation templates. These files are included under +the MIT license. (See LICENSE file.) -The files are imported from the `fiat-c/src` directory of the Fiat repository. -Their contents are `#include`d into source files, so we rename them to `.h`. +Some files are included directly from the `fiat-c/src` directory of the Fiat +Cryptography repository. Their contents are `#include`d into source files, so +we rename them to `.h`. 
Implementations that use saturated arithmetic on 64-bit
+words are further manually edited to use platform-appropriate incantations for
+operations such as addition with carry; these changes are marked with "`NOTE:
+edited after generation`".
+
+# CryptOpt
+
+Files in the `asm` directory are compiled from Fiat-Cryptography templates
+using [CryptOpt](https://github.com/0xADE1A1DE/CryptOpt). These generated
+assembly files have been edited to support call-stack unwinding. The modified
+files have been checked for functional correctness using the CryptOpt
+translation validator that is included in the Fiat-Cryptography repository.
+Correct unwinding and manual assembler-directive changes related to object-file
+conventions are validated using unit tests.
diff --git a/third_party/fiat/asm/fiat_curve25519_adx_mul.S b/third_party/fiat/asm/fiat_curve25519_adx_mul.S
new file mode 100644
index 0000000000..f4c70dd41e
--- /dev/null
+++ b/third_party/fiat/asm/fiat_curve25519_adx_mul.S
@@ -0,0 +1,169 @@
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__APPLE__) || defined(__ELF__))
+
+.intel_syntax noprefix
+.text
+#if defined(__APPLE__)
+.private_extern _fiat_curve25519_adx_mul
+.global _fiat_curve25519_adx_mul
+_fiat_curve25519_adx_mul:
+#else
+.type fiat_curve25519_adx_mul, @function
+.hidden fiat_curve25519_adx_mul
+.global fiat_curve25519_adx_mul
+fiat_curve25519_adx_mul:
+#endif
+
+.cfi_startproc
+mov [rsp - 0x08], rbp
+.cfi_offset rbp, -8-0x08
+mov rbp, rsp
+
+mov rax, rdx
+mov rdx, [ rsi + 0x18 ]
+mulx r11, r10, [ rax + 0x8 ]
+mov rdx, [ rax + 0x0 ]
+mov [ rsp - 0x58 ], r15
+.cfi_offset r15, -8-0x58
+mulx r8, rcx, [ rsi + 0x18 ]
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x80 ], rbx
+.cfi_offset rbx, -8-0x80
+mulx rbx, r9, [ rax + 0x18 ]
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x70 ], r12
+.cfi_offset r12, -8-0x70
+mulx r15, r12, [ rax + 0x8 ]
+mov rdx, [ rsi + 0x0 ]
+mov [ rsp - 0x68 ], r13
+.cfi_offset r13, -8-0x68
+mov [ rsp - 0x60 ], r14
+.cfi_offset r14, -8-0x60
+mulx r14, r13, [ rax + 0x0 ]
+mov rdx, [ rax + 0x10 ]
+mov [ rsp - 0x18 ], r15
+mov [ rsp - 0x50 ], rdi
+mulx rdi, r15, [ rsi + 0x0 ]
+mov rdx, [ rax + 0x18 ]
+mov [ rsp - 0x48 ], r13
+mov [ rsp - 0x40 ], r9
+mulx r9, r13, [ rsi + 0x0 ]
+test al, al
+adox rcx, rdi
+mov rdx, [ rsi + 0x10 ]
+mov [ rsp - 0x38 ], r13
+mulx r13, rdi, [ rax + 0x8 ]
+adox r10, r9
+mov rdx, 0x0
+adox rbx, rdx
+adcx rdi, rcx
+adcx r8, r10
+mov r9, rdx
+adcx r9, rbx
+mov rdx, [ rsi + 0x10 ]
+mulx r10, rcx, [ rax + 0x0 ]
+mov rdx, [ rsi + 0x0 ]
+mov [ rsp - 0x30 ], r15
+mulx r15, rbx, [ rax + 0x8 ]
+mov rdx, -0x2
+inc rdx
+adox rcx, r15
+setc r15b
+clc
+adcx rcx, r12
+adox r10, rdi
+mov rdx, [ rax + 0x10 ]
+mov [ rsp - 0x78 ], rcx
+mulx rcx, rdi, [ rsi + 0x10 ]
+adox rdi, r8
+mov rdx, [ rax + 0x18 ]
+mov [ rsp - 0x28 ], rcx
+mulx rcx, r8, [ rsi + 0x10 ]
+mov rdx, [ rax + 0x10 ]
+mov [ rsp - 0x20 ], r8
+mulx r12, r8, [ rsi + 0x18 ]
+adox r8, r9
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x10 ], r12
+mulx r12, r9, [ rax + 0x10 ]
+movzx rdx, r15b
+lea rdx, [ rdx + rcx ]
+adcx r9, r10
+adcx r13, rdi
+mov r15, 0x0
+mov r10, r15
+adox r10, rdx
+mov rdx, [ rax + 0x18 ]
+mulx rcx, rdi, [ rsi + 0x18 ]
+adox rcx, r15
+adcx r11, r8
+mov rdx, r15
+adcx rdx, r10
+adcx rcx, r15
+mov r8, rdx
+mov rdx, [ rax + 0x0 ]
+mulx r15, r10, [ rsi + 0x8 ]
+test al, al
+adox r10, r14
+adcx rbx, r10
+adox r15, [ rsp - 0x78 ]
+adcx r15, [ rsp - 0x30 ]
+adox r9, [ rsp - 0x18 ]
+adcx r9, [ rsp - 0x38 ]
+adox r13, [ rsp - 0x40 ]
+adcx r12, r13
+adox r11, [ rsp - 0x20 ]
+adcx r11, [ rsp - 0x28 ]
+mov rdx, 0x26
+mulx rsi, r14, r12
+adox rdi, r8
+adcx rdi, [ rsp - 0x10 ]
+mulx r10, r8, r11
+mov r13, 0x0
+adox rcx, r13
+adcx rcx, r13
+mulx r11, r12, rdi
+xor rdi, rdi
+adox r8, rbx
+adox r12, r15
+mulx rbx, r13, rcx
+adcx r14, [ rsp - 0x48 ]
+adox r13, r9
+adox rbx, rdi
+adcx rsi, r8
+adcx r10, r12
+adcx r11, r13
+adc rbx, 0x0
+mulx r9, r15, rbx
+xor r9, r9
+adox r15, r14
+mov rdi, r9
+adox rdi, rsi
+mov rcx, r9
+adox rcx, r10
+mov r8, [ rsp - 0x50 ]
+mov [ r8 + 0x8 ], rdi
+mov r12, r9
+adox r12, r11
+mov r14, r9
+cmovo r14, rdx
+mov [ r8 + 0x18 ], r12
+adcx r15, r14
+mov [ r8 + 0x0 ], r15
+mov [ r8 + 0x10 ], rcx
+mov rbx, [ rsp - 0x80 ]
+mov r12, [ rsp - 0x70 ]
+mov r13, [ rsp - 0x68 ]
+mov r14, [ rsp - 0x60 ]
+mov r15, [ rsp - 0x58 ]
+
+mov rbp, [rsp - 0x08]
+ret
+.cfi_endproc
+#if defined(__ELF__)
+.size fiat_curve25519_adx_mul, .-fiat_curve25519_adx_mul
+#endif
+
+#endif
diff --git a/third_party/fiat/asm/fiat_curve25519_adx_square.S b/third_party/fiat/asm/fiat_curve25519_adx_square.S
new file mode 100644
index 0000000000..9b1fdb9cf5
--- /dev/null
+++ b/third_party/fiat/asm/fiat_curve25519_adx_square.S
@@ -0,0 +1,137 @@
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__APPLE__) || defined(__ELF__))
+
+.intel_syntax noprefix
+.text
+#if defined(__APPLE__)
+.private_extern _fiat_curve25519_adx_square
+.global _fiat_curve25519_adx_square
+_fiat_curve25519_adx_square:
+#else
+.type fiat_curve25519_adx_square, @function
+.hidden fiat_curve25519_adx_square
+.global fiat_curve25519_adx_square
+fiat_curve25519_adx_square:
+#endif
+
+.cfi_startproc
+mov [rsp - 0x08], rbp
+.cfi_offset rbp, -8-0x08
+mov rbp, rsp
+
+mov rdx, [ rsi + 0x0 ]
+mulx r10, rax, [ rsi + 0x8 ]
+mov rdx, [ rsi + 0x0 ]
+mulx rcx, r11, [ rsi + 0x10 ]
+xor rdx, rdx
+adox r11, r10
+mov rdx, [ rsi + 0x0 ]
+mulx r9, r8, [ rsi + 0x18 ]
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x80 ], rbx
+.cfi_offset rbx, -8-0x80
+mulx rbx, r10, [ rsi + 0x18 ]
+adox r8, rcx
+mov [rsp - 0x48 ], rdi
+adox r10, r9
+adcx rax, rax
+mov rdx, [ rsi + 0x10 ]
+mulx r9, rcx, [ rsi + 0x18 ]
+adox rcx, rbx
+mov rdx, [ rsi + 0x10 ]
+mulx rdi, rbx, [ rsi + 0x8 ]
+mov rdx, 0x0
+adox r9, rdx
+mov [ rsp - 0x70 ], r12
+.cfi_offset r12, -8-0x70
+mov r12, -0x3
+inc r12
+adox rbx, r8
+adox rdi, r10
+adcx r11, r11
+mov r8, rdx
+adox r8, rcx
+mov r10, rdx
+adox r10, r9
+adcx rbx, rbx
+mov rdx, [ rsi + 0x0 ]
+mulx r9, rcx, rdx
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x68 ], r13
+.cfi_offset r13, -8-0x68
+mov [ rsp - 0x60 ], r14
+.cfi_offset r14, -8-0x60
+mulx r14, r13, rdx
+seto dl
+inc r12
+adox r9, rax
+adox r13, r11
+adox r14, rbx
+adcx rdi, rdi
+mov al, dl
+mov rdx, [ rsi + 0x10 ]
+mulx rbx, r11, rdx
+adox r11, rdi
+adcx r8, r8
+adox rbx, r8
+adcx r10, r10
+movzx rdx, al
+mov rdi, 0x0
+adcx rdx, rdi
+movzx r8, al
+lea r8, [ r8 + rdx ]
+mov rdx, [ rsi + 0x18 ]
+mulx rdi, rax, rdx
+adox rax, r10
+mov rdx, 0x26
+mov [ rsp - 0x58 ], r15
+.cfi_offset r15, -8-0x58
+mulx r15, r10, r11
+clc
+adcx r10, rcx
+mulx r11, rcx, rbx
+adox r8, rdi
+mulx rdi, rbx, r8
+inc r12
+adox rcx, r9
+mulx r8, r9, rax
+adcx r15, rcx
+adox r9, r13
+adcx r11, r9
+adox rbx, r14
+adox rdi, r12
+adcx r8, rbx
+adc rdi, 0x0
+mulx r14, r13, rdi
+test al, al
+mov rdi, [ rsp - 0x48 ]
+adox r13, r10
+mov r14, r12
+adox r14, r15
+mov [ rdi + 0x8 ], r14
+mov rax, r12
+adox rax, r11
+mov r10, r12
+adox r10, r8
+mov [ rdi + 0x10 ], rax
+mov rcx, r12
+cmovo rcx, rdx
+adcx r13, rcx
+mov [ rdi + 0x0 ], r13
+mov [ rdi + 0x18 ], r10
+mov rbx, [ rsp - 0x80 ]
+mov r12, [ rsp - 0x70 ]
+mov r13, [ rsp - 0x68 ]
+mov r14, [ rsp - 0x60 ]
+mov r15, [ rsp - 0x58 ]
+
+mov rbp, [rsp - 0x08]
+ret
+.cfi_endproc
+#if defined(__ELF__)
+.size fiat_curve25519_adx_square, .-fiat_curve25519_adx_square
+#endif
+
+#endif
diff --git a/third_party/fiat/curve25519_64_adx.h b/third_party/fiat/curve25519_64_adx.h
new file mode 100644
index 0000000000..f50f5b8377
--- /dev/null
+++ b/third_party/fiat/curve25519_64_adx.h
@@ -0,0 +1,691 @@
+#include <ring-core/base.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef uint64_t fe4[4];
+typedef uint8_t fiat_uint1;
+typedef int8_t fiat_int1;
+
+static __inline__ uint64_t fiat_value_barrier_u64(uint64_t a) {
+  __asm__("" : "+r"(a) : /* no inputs */);
+  return a;
+}
+
+__attribute__((target("adx,bmi2")))
+static inline void fe4_mul(fe4 out, const fe4 x, const fe4 y) { fiat_curve25519_adx_mul(out, x, y); }
+
+__attribute__((target("adx,bmi2")))
+static inline void fe4_sq(fe4 out, const fe4 x) { fiat_curve25519_adx_square(out, x); }
+
+/*
+ * The function fiat_mulx_u64 is a multiplication, returning the full double-width result.
+ *
+ * Postconditions:
+ *   out1 = (arg1 * arg2) mod 2^64
+ *   out2 = ⌊arg1 * arg2 / 2^64⌋
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0xffffffffffffffff]
+ *   arg2: [0x0 ~> 0xffffffffffffffff]
+ * Output Bounds:
+ *   out1: [0x0 ~> 0xffffffffffffffff]
+ *   out2: [0x0 ~> 0xffffffffffffffff]
+ */
+__attribute__((target("adx,bmi2")))
+static inline void fiat_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) {
+// NOTE: edited after generation
+#if defined(_M_X64)
+  unsigned long long t;
+  *out1 = _umul128(arg1, arg2, &t);
+  *out2 = t;
+#elif defined(_M_ARM64)
+  *out1 = arg1 * arg2;
+  *out2 = __umulh(arg1, arg2);
+#else
+  unsigned __int128 t = (unsigned __int128)arg1 * arg2;
+  *out1 = t;
+  *out2 = (t >> 64);
+#endif
+}
+
+/*
+ * The function fiat_addcarryx_u64 is an addition with carry.
+ *
+ * Postconditions:
+ *   out1 = (arg1 + arg2 + arg3) mod 2^64
+ *   out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0x1]
+ *   arg2: [0x0 ~> 0xffffffffffffffff]
+ *   arg3: [0x0 ~> 0xffffffffffffffff]
+ * Output Bounds:
+ *   out1: [0x0 ~> 0xffffffffffffffff]
+ *   out2: [0x0 ~> 0x1]
+ */
+__attribute__((target("adx,bmi2")))
+static inline void fiat_addcarryx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+// NOTE: edited after generation
+#if defined(__has_builtin)
+# if __has_builtin(__builtin_ia32_addcarryx_u64)
+#  define addcarry64 __builtin_ia32_addcarryx_u64
+# endif
+#endif
+#if defined(addcarry64)
+  long long unsigned int t;
+  *out2 = addcarry64(arg1, arg2, arg3, &t);
+  *out1 = t;
+#elif defined(_M_X64)
+  long long unsigned int t;
+  *out2 = _addcarry_u64(arg1, arg2, arg3, &t);
+  *out1 = t;
+#else
+  arg2 += arg1;
+  arg1 = arg2 < arg1;
+  uint64_t ret = arg2 + arg3;
+  arg1 += ret < arg2;
+  *out1 = ret;
+  *out2 = arg1;
+#endif
+#undef addcarry64
+}
+
+/*
+ * The function fiat_subborrowx_u64 is a subtraction with borrow.
+ * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_subborrowx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_subborrow_u64) +# define subborrow64 __builtin_ia32_subborrow_u64 +# endif +#endif +#if defined(subborrow64) + long long unsigned int t; + *out2 = subborrow64(arg1, arg2, arg3, &t); + *out1 = t; +#elif defined(_M_X64) + long long unsigned int t; + *out2 = _subborrow_u64(arg1, arg2, arg3, &t); // NOTE: edited after generation + *out1 = t; +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +#undef subborrow64 +} + +/* + * The function fiat_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_cmovznz_u64(uint64_t* out1, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_value_barrier_u64(x2) & arg3) | (fiat_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_add(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + fiat_uint1 x2; + uint64_t x3; + fiat_uint1 x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + fiat_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_uint1 x11; + uint64_t x12; + fiat_uint1 x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + uint64_t x19; + fiat_uint1 x20; + fiat_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_cmovznz_u64(&x9, x8, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_addcarryx_u64(&x12, &x13, x11, x3, 0x0); + fiat_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_addcarryx_u64(&x16, &x17, x15, x7, 0x0); + fiat_cmovznz_u64(&x18, x17, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x19, &x20, 0x0, x10, x18); + out1[0] = x19; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 
0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_sub(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + uint64_t x2; + fiat_uint1 x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + fiat_uint1 x9; + uint64_t x10; + uint64_t x11; + fiat_uint1 x12; + uint64_t x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + fiat_uint1 x19; + uint64_t x20; + fiat_uint1 x21; + uint64_t x22; + uint64_t x23; + fiat_uint1 x24; + x1 = (arg2[0]); + fiat_subborrowx_u64(&x2, &x3, 0x0, (arg1[0]), x1); + x4 = (arg2[1]); + fiat_subborrowx_u64(&x5, &x6, x3, (arg1[1]), x4); + x7 = (arg2[2]); + fiat_subborrowx_u64(&x8, &x9, x6, (arg1[2]), x7); + x10 = (arg2[3]); + fiat_subborrowx_u64(&x11, &x12, x9, (arg1[3]), x10); + fiat_cmovznz_u64(&x13, x12, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_subborrowx_u64(&x14, &x15, 0x0, x2, x13); + fiat_subborrowx_u64(&x16, &x17, x15, x5, 0x0); + fiat_subborrowx_u64(&x18, &x19, x17, x8, 0x0); + fiat_subborrowx_u64(&x20, &x21, x19, x11, 0x0); + fiat_cmovznz_u64(&x22, x21, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_subborrowx_u64(&x23, &x24, 0x0, x14, x22); + out1[0] = x23; + out1[1] = x16; + out1[2] = x18; + out1[3] = x20; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [0x0 ~> 0x3ffffffffffffff] // NOTE: this is not any uint64! + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_scmul(uint64_t out1[4], const uint64_t arg1[4], uint64_t arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + fiat_uint1 x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_uint1 x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + fiat_uint1 x18; + uint64_t x19; + fiat_uint1 x20; + uint64_t x21; + fiat_uint1 x22; + uint64_t x23; + fiat_uint1 x24; + uint64_t x25; + uint64_t x26; + fiat_uint1 x27; + fiat_mulx_u64(&x1, &x2, (arg1[0]), arg2); + fiat_mulx_u64(&x3, &x4, (arg1[1]), arg2); + fiat_addcarryx_u64(&x5, &x6, 0x0, x2, x3); + fiat_mulx_u64(&x7, &x8, (arg1[2]), arg2); + fiat_addcarryx_u64(&x9, &x10, x6, x4, x7); + fiat_mulx_u64(&x11, &x12, (arg1[3]), arg2); + fiat_addcarryx_u64(&x13, &x14, x10, x8, x11); + fiat_mulx_u64(&x15, &x16, (x12 + (uint64_t)x14), UINT8_C(0x26)); + fiat_addcarryx_u64(&x17, &x18, 0x0, x1, x15); + fiat_addcarryx_u64(&x19, &x20, x18, x5, 0x0); + fiat_addcarryx_u64(&x21, &x22, x20, x9, 0x0); + fiat_addcarryx_u64(&x23, &x24, x22, x13, 0x0); + fiat_cmovznz_u64(&x25, x24, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x26, &x27, 0x0, x17, x25); + out1[0] = x26; + out1[1] = x19; + out1[2] = x21; + out1[3] = x23; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ 
+__attribute__((target("adx,bmi2")))
+static void fe4_canon(uint64_t out1[4], const uint64_t arg1[4]) {
+  uint64_t x1;
+  fiat_uint1 x2;
+  uint64_t x3;
+  fiat_uint1 x4;
+  uint64_t x5;
+  fiat_uint1 x6;
+  uint64_t x7;
+  fiat_uint1 x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  uint64_t x13;
+  fiat_uint1 x14;
+  uint64_t x15;
+  fiat_uint1 x16;
+  uint64_t x17;
+  fiat_uint1 x18;
+  uint64_t x19;
+  fiat_uint1 x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  fiat_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0xffffffffffffffed));
+  fiat_subborrowx_u64(&x3, &x4, x2, (arg1[1]), UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x5, &x6, x4, (arg1[2]), UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7fffffffffffffff));
+  fiat_cmovznz_u64(&x9, x8, x1, (arg1[0]));
+  fiat_cmovznz_u64(&x10, x8, x3, (arg1[1]));
+  fiat_cmovznz_u64(&x11, x8, x5, (arg1[2]));
+  fiat_cmovznz_u64(&x12, x8, x7, (arg1[3]));
+  fiat_subborrowx_u64(&x13, &x14, 0x0, x9, UINT64_C(0xffffffffffffffed));
+  fiat_subborrowx_u64(&x15, &x16, x14, x10, UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x17, &x18, x16, x11, UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x19, &x20, x18, x12, UINT64_C(0x7fffffffffffffff));
+  fiat_cmovznz_u64(&x21, x20, x13, x9);
+  fiat_cmovznz_u64(&x22, x20, x15, x10);
+  fiat_cmovznz_u64(&x23, x20, x17, x11);
+  fiat_cmovznz_u64(&x24, x20, x19, x12);
+  out1[0] = x21;
+  out1[1] = x22;
+  out1[2] = x23;
+  out1[3] = x24;
+}
+
+/*
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0x1]
+ *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+__attribute__((target("adx,bmi2")))
+static void fe4_cswap(uint64_t out1[4], uint64_t out2[4], fiat_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  // NOTE: clang 14 for Zen 2 uses YMM registers
+  fiat_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
+  fiat_cmovznz_u64(&x5, arg1, (arg3[0]), (arg2[0]));
+  fiat_cmovznz_u64(&x6, arg1, (arg3[1]), (arg2[1]));
+  fiat_cmovznz_u64(&x7, arg1, (arg3[2]), (arg2[2]));
+  fiat_cmovznz_u64(&x8, arg1, (arg3[3]), (arg2[3]));
+  out1[0] = x1;
+  out1[1] = x2;
+  out1[2] = x3;
+  out1[3] = x4;
+  out2[0] = x5;
+  out2[1] = x6;
+  out2[2] = x7;
+  out2[3] = x8;
+}
+
+// The following functions are adapted from crypto/curve25519/curve25519.c.
+// It would be desirable to share the code, but with the current field
+// implementations both 4-limb and 5-limb versions of the curve-level code need
+// to be included in builds targeting an unknown variant of x86_64.
+ +__attribute__((target("adx,bmi2"))) +static void fe4_invert(fe4 out, const fe4 z) { + fe4 t0; + fe4 t1; + fe4 t2; + fe4 t3; + int i; + + fe4_sq(t0, z); + fe4_sq(t1, t0); + for (i = 1; i < 2; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(t1, z, t1); + fe4_mul(t0, t0, t1); + fe4_sq(t2, t0); + fe4_mul(t1, t1, t2); + fe4_sq(t2, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 20; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 100; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t1, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(out, t1, t0); +} + +__attribute__((target("adx,bmi2"))) +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { + uint8_t e[32]; + memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. 
+ // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe4 x1, x2 = {1}, z2 = {0}, x3, z3 = {1}, tmp0, tmp1; + OPENSSL_memcpy(x1, point, sizeof(fe4)); + x1[3] &= (uint64_t)(-1)>>1; + OPENSSL_memcpy(x3, x1, sizeof(fe4)); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe4_sub(tmp0, x3, z3); + fe4_sub(tmp1, x2, z2); + fe4_add(x2, x2, z2); + fe4_add(z2, x3, z3); + fe4_mul(z3, tmp0, x2); + fe4_mul(z2, z2, tmp1); + fe4_sq(tmp0, tmp1); + fe4_sq(tmp1, x2); + fe4_add(x3, z3, z2); + fe4_sub(z2, z3, z2); + fe4_mul(x2, tmp1, tmp0); + fe4_sub(tmp1, tmp1, tmp0); + fe4_sq(z2, z2); + fe4_scmul(z3, tmp1, 121666); + fe4_sq(x3, x3); + fe4_add(tmp0, tmp0, z3); + fe4_mul(z3, x1, z2); + fe4_mul(z2, tmp1, tmp0); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + + fe4_invert(z2, z2); + fe4_mul(x2, x2, z2); + fe4_canon(x2, x2); + OPENSSL_memcpy(out, x2, sizeof(fe4)); +} + +typedef struct { + fe4 X; + fe4 Y; + fe4 Z; + fe4 T; +} ge_p3_4; + +typedef struct { + fe4 yplusx; + fe4 yminusx; + fe4 xy2d; +} ge_precomp_4; + +__attribute__((target("adx,bmi2"))) +static void inline_x25519_ge_dbl_4(ge_p3_4 *r, const ge_p3_4 *p, bool skip_t) { + // Transcribed from a Coq function proven against affine coordinates. + // https://github.com/mit-plv/fiat-crypto/blob/9943ba9e7d8f3e1c0054b2c94a5edca46ea73ef8/src/Curves/Edwards/XYZT/Basic.v#L136-L165 + fe4 trX, trZ, trT, t0, cX, cY, cZ, cT; + fe4_sq(trX, p->X); + fe4_sq(trZ, p->Y); + fe4_sq(trT, p->Z); + fe4_add(trT, trT, trT); + fe4_add(cY, p->X, p->Y); + fe4_sq(t0, cY); + fe4_add(cY, trZ, trX); + fe4_sub(cZ, trZ, trX); + fe4_sub(cX, t0, cY); + fe4_sub(cT, trT, cZ); + fe4_mul(r->X, cX, cT); + fe4_mul(r->Y, cY, cZ); + fe4_mul(r->Z, cZ, cT); + if (!skip_t) { + fe4_mul(r->T, cX, cY); + } +} + +__attribute__((target("adx,bmi2"))) +__attribute__((always_inline)) // 4% speedup with clang14 and zen2 +static inline void +ge_p3_add_p3_precomp_4(ge_p3_4 *r, const ge_p3_4 *p, const ge_precomp_4 *q) { + fe4 A, B, C, YplusX, YminusX, D, X3, Y3, Z3, T3; + // Transcribed from a Coq function proven against affine coordinates. 
+ // https://github.com/mit-plv/fiat-crypto/blob/a36568d1d73aff5d7accc79fd28be672882f9c17/src/Curves/Edwards/XYZT/Precomputed.v#L38-L56 + fe4_add(YplusX, p->Y, p->X); + fe4_sub(YminusX, p->Y, p->X); + fe4_mul(A, YplusX, q->yplusx); + fe4_mul(B, YminusX, q->yminusx); + fe4_mul(C, q->xy2d, p->T); + fe4_add(D, p->Z, p->Z); + fe4_sub(X3, A, B); + fe4_add(Y3, A, B); + fe4_add(Z3, D, C); + fe4_sub(T3, D, C); + fe4_mul(r->X, X3, T3); + fe4_mul(r->Y, Y3, Z3); + fe4_mul(r->Z, Z3, T3); + fe4_mul(r->T, X3, Y3); +} + +__attribute__((always_inline)) // 25% speedup with clang14 and zen2 +static inline void table_select_4(ge_precomp_4 *t, const int pos, + const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; +#if defined(__clang__) + __asm__("" : "+m" (t_bytes) : /*no inputs*/); +#endif + static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + static_assert(sizeof(t_bytes) == sizeof(ge_precomp_4), ""); + + // fe4 uses saturated 64-bit limbs, so converting from bytes is just a copy. + OPENSSL_memcpy(t, t_bytes, sizeof(ge_precomp_4)); + + fe4 xy2d_neg = {0}; + fe4_sub(xy2d_neg, xy2d_neg, t->xy2d); + constant_time_conditional_memcpy(t->yplusx, t_bytes[1], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->yminusx, t_bytes[0], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->xy2d, xy2d_neg, sizeof(fe4), bnegative); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. +// +// Preconditions: +// a[31] <= 127 +__attribute__((target("adx,bmi2"))) +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]) { + signed char e[64]; + signed char carry; + + for (unsigned i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (unsigned i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_4 r = {{0}, {1}, {1}, {0}}; + for (unsigned i = 1; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/false); + + for (unsigned i = 0; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + // fe4 uses saturated 64-bit limbs, so converting to bytes is just a copy. + // Satisfy stated precondition of fiat_25519_from_bytes; tests pass either way + fe4_canon(r.X, r.X); + fe4_canon(r.Y, r.Y); + fe4_canon(r.Z, r.Z); + fe4_canon(r.T, r.T); + static_assert(sizeof(ge_p3_4) == sizeof(uint8_t[4][32]), ""); + OPENSSL_memcpy(h, &r, sizeof(ge_p3_4)); +}
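Reviewer note (not part of the patch): the constant 0x26 = 38 that recurs in
fe4_add, fe4_sub, fe4_scmul, and both CryptOpt .S files is the reduction
constant for p = 2^255 - 19. The fe4 representation holds a field element in
four saturated 64-bit limbs, i.e. modulo 2^256, and a carry out of bit 256 is
folded back into the low limb with a multiply by 38 because

    2^256 = 2 * (2^255 - 19) + 38,  so  2^256 ≡ 38 (mod 2^255 - 19).

fe4_sub uses the same identity with the sign flipped: a borrow out of bit 256
is repaid by subtracting 38.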
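The digit recoding at the heart of x25519_ge_scalarmult_base_adx is easy to
sanity-check in isolation. The following standalone C program (not part of the
patch; the test scalar is arbitrary and only needs to satisfy the stated
precondition a[31] <= 127) runs the exact recoding loop from the patch and
verifies both claims in its comments: every digit lands in [-8, 8], and the
signed digits still represent the original scalar in base 16.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t a[32] = {0x09, 0xff, 0x5a, 0x00, 0x77};  // remaining bytes zero
  signed char e[64];
  signed char carry;

  // Split each byte into two base-16 digits, low nibble first (as in the patch).
  for (unsigned i = 0; i < 32; ++i) {
    e[2 * i + 0] = (a[i] >> 0) & 15;
    e[2 * i + 1] = (a[i] >> 4) & 15;
  }
  // Recode digits from [0,15] into [-8,8]: a digit >= 8 becomes digit - 16 and
  // pushes a +1 carry into the next digit (verbatim loop from the patch).
  carry = 0;
  for (unsigned i = 0; i < 63; ++i) {
    e[i] += carry;
    carry = e[i] + 8;
    carry >>= 4;
    e[i] -= carry << 4;
  }
  e[63] += carry;

  // Check: each e[i] is in [-8,8] and sum(e[i] * 16^i) == a. Reconstruct plain
  // base-16 digits by resolving negative digits with borrows, then compare.
  int t[65] = {0};
  for (unsigned i = 0; i < 64; ++i) {
    assert(-8 <= e[i] && e[i] <= 8);
    t[i] += e[i];
    if (t[i] < 0) {
      t[i] += 16;
      t[i + 1] -= 1;
    }
  }
  assert(t[64] == 0);
  for (unsigned i = 0; i < 32; ++i) {
    assert(((t[2 * i + 1] << 4) | t[2 * i]) == a[i]);
  }
  printf("recoding OK\n");
  return 0;
}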
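Similarly, the portable #else fallback of fiat_addcarryx_u64 can be checked
against its stated postconditions (out1 = (arg1 + arg2 + arg3) mod 2^64,
out2 = the carry out) with a small standalone sketch. This is not part of the
patch; it uses unsigned __int128 as the reference, so it assumes a GCC- or
Clang-style 64-bit compiler.

#include <assert.h>
#include <stdint.h>

// Mirror of the patch's portable fallback, with plain uint8_t for the carries.
static void addcarry_portable(uint64_t *out1, uint8_t *out2, uint8_t arg1,
                              uint64_t arg2, uint64_t arg3) {
  arg2 += arg1;                // add the carry-in
  arg1 = arg2 < arg1;          // did that addition wrap?
  uint64_t ret = arg2 + arg3;  // main addition
  arg1 += ret < arg2;          // did that addition wrap?
  *out1 = ret;
  *out2 = arg1;                // at most one of the two additions can wrap
}

int main(void) {
  const uint64_t cases[] = {0, 1, 0x8000000000000000u, UINT64_MAX};
  for (unsigned c = 0; c <= 1; ++c) {
    for (unsigned i = 0; i < 4; ++i) {
      for (unsigned j = 0; j < 4; ++j) {
        uint64_t lo;
        uint8_t hi;
        addcarry_portable(&lo, &hi, (uint8_t)c, cases[i], cases[j]);
        unsigned __int128 s = (unsigned __int128)cases[i] + cases[j] + c;
        assert(lo == (uint64_t)s);
        assert(hi == (uint8_t)(s >> 64));
      }
    }
  }
  return 0;
}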