diff --git a/Cargo.toml b/Cargo.toml
index e2835e252b..e095f9efc5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -110,10 +110,12 @@ include = [
     "examples/**/*.rs",
     "include/ring-core/aes.h",
     "include/ring-core/arm_arch.h",
+    "include/ring-core/asm_base.h",
     "include/ring-core/base.h",
     "include/ring-core/check.h",
     "include/ring-core/mem.h",
     "include/ring-core/poly1305.h",
+    "include/ring-core/target.h",
     "include/ring-core/type_check.h",
     "src/**/*.rs",
     "src/aead/poly1305_test.txt",
diff --git a/crypto/curve25519/asm/x25519-asm-arm.S b/crypto/curve25519/asm/x25519-asm-arm.S
index 04d0362bdd..8a51bb9bdf 100644
--- a/crypto/curve25519/asm/x25519-asm-arm.S
+++ b/crypto/curve25519/asm/x25519-asm-arm.S
@@ -17,15 +17,9 @@
  * domain licensed but the standard ISC license is included above to keep
  * licensing simple. */
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
+#include <ring-core/asm_base.h>
 
-#if !defined(OPENSSL_NO_ASM) && defined(__ARMEL__) && defined(__ELF__)
-
-#include "ring_core_generated/prefix_symbols_asm.h"
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
 
 .fpu neon
 .text
@@ -2127,8 +2121,4 @@
 mov sp,r12
 vpop {q4,q5,q6,q7}
 bx lr
-#endif /* !OPENSSL_NO_ASM && __ARMEL__ && __ELF__ */
-
-#if defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
+#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */
diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c
index e8178e0aea..1ab754d3af 100644
--- a/crypto/curve25519/curve25519.c
+++ b/crypto/curve25519/curve25519.c
@@ -778,6 +778,18 @@ static void table_select(ge_precomp *t, const int pos, const signed char b) {
 // Preconditions:
 //   a[31] <= 127
 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
+#if defined(BORINGSSL_FE25519_ADX)
+  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
+      CRYPTO_is_ADX_capable()) {
+    uint8_t t[4][32];
+    x25519_ge_scalarmult_base_adx(t, a);
+    fiat_25519_from_bytes(h->X.v, t[0]);
+    fiat_25519_from_bytes(h->Y.v, t[1]);
+    fiat_25519_from_bytes(h->Z.v, t[2]);
+    fiat_25519_from_bytes(h->T.v, t[3]);
+    return;
+  }
+#endif
   signed char e[64];
   signed char carry;
   ge_p1p1 r;
diff --git a/crypto/curve25519/curve25519_64_adx.c b/crypto/curve25519/curve25519_64_adx.c
new file mode 100644
index 0000000000..2768989643
--- /dev/null
+++ b/crypto/curve25519/curve25519_64_adx.c
@@ -0,0 +1,18 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include "internal.h"
+#if defined(BORINGSSL_FE25519_ADX)
+#include "../../third_party/fiat/curve25519_64_adx.h"
+#endif
diff --git a/crypto/internal.h b/crypto/internal.h
index fe182bd4ce..6a6351710d 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -223,22 +223,6 @@ static inline crypto_word_t value_barrier_w(crypto_word_t a) {
   return a;
 }
 
-// value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|.
-static inline uint32_t value_barrier_u32(uint32_t a) {
-#if defined(__GNUC__) || defined(__clang__)
-  __asm__("" : "+r"(a) : /* no inputs */);
-#endif
-  return a;
-}
-
-// value_barrier_u64 behaves like |value_barrier_w| but takes a |uint64_t|.
-static inline uint64_t value_barrier_u64(uint64_t a) {
-#if defined(__GNUC__) || defined(__clang__)
-  __asm__("" : "+r"(a) : /* no inputs */);
-#endif
-  return a;
-}
-
 // |value_barrier_u8| could be defined as above, but compilers other than
 // clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM.
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
index d15a0cb108..aa2f20f348 100755
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -153,9 +153,9 @@ sub expand_line {
 
 my ($arch_defines, $target_defines);
 if ($flavour =~ /32/) {
-    $arch_defines = "defined(__ARMEL__)";
+    $arch_defines = "defined(OPENSSL_ARM)";
 } elsif ($flavour =~ /64/) {
-    $arch_defines = "defined(__AARCH64EL__)";
+    $arch_defines = "defined(OPENSSL_AARCH64)";
 } else {
     die "unknown architecture: $flavour";
 }
@@ -177,18 +177,11 @@ sub expand_line {
 // This file is generated from a similarly-named Perl script in the BoringSSL
 // source tree. Do not edit by hand.
 
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
+#include <ring-core/asm_base.h>
 
 #if !defined(OPENSSL_NO_ASM) && $arch_defines && $target_defines
 ___
 
-print "#include \"ring_core_generated/prefix_symbols_asm.h\"\n";
-
 while(my $line=<>) {
 
     if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; }
@@ -258,10 +251,6 @@ sub expand_line {
 print <<___;
 #endif  // !OPENSSL_NO_ASM && $arch_defines && $target_defines
-#if defined(__ELF__)
-// See https://www.airs.com/blog/archives/518.
-.section .note.GNU-stack,"",\%progbits
-#endif
 ___
 
 close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 508cf4986a..044a379da9 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -1520,14 +1520,9 @@ sub rxb {
 	die "unknown target: $flavour";
     }
     print <<___;
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
-
-#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) && $target
-#include "ring_core_generated/prefix_symbols_asm.h"
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && $target
 ___
 }
@@ -1623,13 +1618,7 @@ sub process_line {
     if ($masm) {
 	print "END\n";
     } elsif ($gas) {
-	print <<___;
-#endif
-#if defined(__ELF__)
-// See https://www.airs.com/blog/archives/518.
-.section .note.GNU-stack,"",\%progbits
-#endif
-___
+	print "#endif\n";
     } elsif ($nasm) {
 	print <<___;
 \%else
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index f0abfb5181..59fc975f24 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -305,22 +305,13 @@ sub ::asm_finish
     }
     print <<___;
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
-
-#if !defined(OPENSSL_NO_ASM) && defined(__i386__) && $target
-#include "ring_core_generated/prefix_symbols_asm.h"
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target
 ___
     print @out;
     print <<___;
-#endif  // !defined(OPENSSL_NO_ASM) && defined(__i386__) && $target
-#if defined(__ELF__)
-// See https://www.airs.com/blog/archives/518.
-.section .note.GNU-stack,"",\%progbits
-#endif
+#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target
 ___
 }
 }
diff --git a/crypto/poly1305/poly1305_arm_asm.S b/crypto/poly1305/poly1305_arm_asm.S
index 93f46e8111..df464d068d 100644
--- a/crypto/poly1305/poly1305_arm_asm.S
+++ b/crypto/poly1305/poly1305_arm_asm.S
@@ -1,15 +1,9 @@
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
-#define OPENSSL_NO_ASM
-#endif
-#endif
+#include <ring-core/asm_base.h>
 
-#if defined(__ARMEL__) && !defined(OPENSSL_NO_ASM) && defined(__ELF__)
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
 
 #pragma GCC diagnostic ignored "-Wlanguage-extension-token"
 
-#include "ring_core_generated/prefix_symbols_asm.h"
-
 # This implementation was taken from the public domain, neon2 version in
 # SUPERCOP by D. J. Bernstein and Peter Schwabe.
 
@@ -2022,8 +2016,4 @@
 vst1.8 d4,[r0,: 64]
 add sp,sp,#0
 bx lr
-#endif /* __ARMEL__ && !OPENSSL_NO_ASM && __ELF__ */
-
-#if defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
+#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */
diff --git a/include/ring-core/arm_arch.h b/include/ring-core/arm_arch.h
index 77229ddc87..2fc0fc0421 100644
--- a/include/ring-core/arm_arch.h
+++ b/include/ring-core/arm_arch.h
@@ -53,12 +53,13 @@
 #ifndef OPENSSL_HEADER_ARM_ARCH_H
 #define OPENSSL_HEADER_ARM_ARCH_H
 
+#include <ring-core/target.h>
+
 // arm_arch.h contains symbols used by ARM assembly, and the C code that calls
 // it. It is included as a public header to simplify the build, but is not
 // intended for external use.
 
-#if defined(__ARMEL__) || defined(_M_ARM) || defined(__AARCH64EL__) || \
-    defined(_M_ARM64)
+#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 
 // ARMV7_NEON is true when a NEON unit is present in the current CPU.
 #define ARMV7_NEON (1 << 0)
@@ -91,124 +92,8 @@
 // will be included.
 #define __ARM_MAX_ARCH__ 8
 
-// Support macros for
-//   - Armv8.3-A Pointer Authentication and
-//   - Armv8.5-A Branch Target Identification
-// features which require emitting a .note.gnu.property section with the
-// appropriate architecture-dependent feature bits set.
-//
-// |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
-// PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
-// used immediately before saving the LR register (x30) to the stack.
-// |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
-// it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
-// with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
-// have the same value at the two points. 
For example: -// -// .global f -// f: -// AARCH64_SIGN_LINK_REGISTER -// stp x29, x30, [sp, #-96]! -// mov x29, sp -// ... -// ldp x29, x30, [sp], #96 -// AARCH64_VALIDATE_LINK_REGISTER -// ret -// -// |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or -// |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an -// indirect call target. In particular, all symbols exported from a file must -// begin with one of these macros. For example, a leaf function that does not -// save LR can instead use |AARCH64_VALID_CALL_TARGET|: -// -// .globl return_zero -// return_zero: -// AARCH64_VALID_CALL_TARGET -// mov x0, #0 -// ret -// -// A non-leaf function which does not immediately save LR may need both macros -// because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function -// may jump to an alternate implementation before setting up the stack: -// -// .globl with_early_jump -// with_early_jump: -// AARCH64_VALID_CALL_TARGET -// cmp x0, #128 -// b.lt .Lwith_early_jump_128 -// AARCH64_SIGN_LINK_REGISTER -// stp x29, x30, [sp, #-96]! -// mov x29, sp -// ... -// ldp x29, x30, [sp], #96 -// AARCH64_VALIDATE_LINK_REGISTER -// ret -// -// .Lwith_early_jump_128: -// ... -// ret -// -// These annotations are only required with indirect calls. Private symbols that -// are only the target of direct calls do not require annotations. Also note -// that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not -// indirect jumps (BR). Indirect jumps in assembly are currently not supported -// and would require a macro for BTI 'j'. -// -// Although not necessary, it is safe to use these macros in 32-bit ARM -// assembly. This may be used to simplify dual 32-bit and 64-bit files. -// -// References: -// - "ELF for the Arm® 64-bit Architecture" -// https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst -// - "Providing protection for complex software" -// https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software - -#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 -#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification -#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' -#else -#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification -#define AARCH64_VALID_CALL_TARGET -#endif - -#if defined(__ARM_FEATURE_PAC_DEFAULT) && \ - (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 // Signed with A-key -#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ - (1 << 1) // Has Pointer Authentication -#define AARCH64_SIGN_LINK_REGISTER hint #25 // PACIASP -#define AARCH64_VALIDATE_LINK_REGISTER hint #29 // AUTIASP -#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ - (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 // Signed with B-key -#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ - (1 << 1) // Has Pointer Authentication -#define AARCH64_SIGN_LINK_REGISTER hint #27 // PACIBSP -#define AARCH64_VALIDATE_LINK_REGISTER hint #31 // AUTIBSP -#else -#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 // No Pointer Authentication -#if GNU_PROPERTY_AARCH64_BTI != 0 -#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET -#else -#define AARCH64_SIGN_LINK_REGISTER -#endif -#define AARCH64_VALIDATE_LINK_REGISTER -#endif - -#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 -.pushsection .note.gnu.property, "a"; -.balign 8; -.long 4; -.long 0x10; -.long 0x5; -.asciz "GNU"; -.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ -.long 4; -.long 
(GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
-.long 0;
-.popsection;
-#endif
-
 #endif  // __ASSEMBLER__
 
-#endif  // __ARMEL__ || _M_ARM || __AARCH64EL__ || _M_ARM64
+#endif  // ARM || AARCH64
 
 #endif  // OPENSSL_HEADER_ARM_ARCH_H
diff --git a/include/ring-core/asm_base.h b/include/ring-core/asm_base.h
new file mode 100644
index 0000000000..c905e998c1
--- /dev/null
+++ b/include/ring-core/asm_base.h
@@ -0,0 +1,186 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_ASM_BASE_H
+#define OPENSSL_HEADER_ASM_BASE_H
+
+#include <ring-core/target.h>
+
+
+// This header contains symbols and common sections used by assembly files. It
+// is included as a public header to simplify the build, but is not intended for
+// external use.
+//
+// Every assembly file must include this header. Some linker features require
+// all object files to be tagged with some section metadata. This header file,
+// when included in assembly, adds that metadata. It also makes defines like
+// |OPENSSL_X86_64| available and includes the prefixing macros.
+//
+// Including this header in an assembly file implies:
+//
+//   - The file does not require an executable stack.
+//
+//   - The file, on aarch64, uses the macros defined below to be compatible with
+//     BTI and PAC.
+
+#if defined(__ASSEMBLER__)
+
+#include <ring_core_generated/prefix_symbols_asm.h>
+
+#if defined(__ELF__)
+// Every ELF object file, even empty ones, should disable executable stacks. See
+// https://www.airs.com/blog/archives/518.
+.pushsection .note.GNU-stack, "", %progbits
+.popsection
+#endif
+
+#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
+
+// We require the ARM assembler provide |__ARM_ARCH| from Arm C Language
+// Extensions (ACLE). This is supported in GCC 4.8+ and Clang 3.2+. MSVC does
+// not implement ACLE, but we require Clang's assembler on Windows.
+#if !defined(__ARM_ARCH)
+#error "ARM assembler must define __ARM_ARCH"
+#endif
+
+// __ARM_ARCH__ is used by OpenSSL assembly to determine the minimum target ARM
+// version.
+//
+// TODO(davidben): Switch the assembly to use |__ARM_ARCH| directly.
+#define __ARM_ARCH__ __ARM_ARCH
+
+// Even when building for 32-bit ARM, support for aarch64 crypto instructions
+// will be included.
+#define __ARM_MAX_ARCH__ 8
+
+// Support macros for
+//   - Armv8.3-A Pointer Authentication and
+//   - Armv8.5-A Branch Target Identification
+// features which require emitting a .note.gnu.property section with the
+// appropriate architecture-dependent feature bits set.
+//
+// |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+// PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+// used immediately before saving the LR register (x30) to the stack.
+// |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+// it. 
Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone +// with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also +// have the same value at the two points. For example: +// +// .global f +// f: +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or +// |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an +// indirect call target. In particular, all symbols exported from a file must +// begin with one of these macros. For example, a leaf function that does not +// save LR can instead use |AARCH64_VALID_CALL_TARGET|: +// +// .globl return_zero +// return_zero: +// AARCH64_VALID_CALL_TARGET +// mov x0, #0 +// ret +// +// A non-leaf function which does not immediately save LR may need both macros +// because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function +// may jump to an alternate implementation before setting up the stack: +// +// .globl with_early_jump +// with_early_jump: +// AARCH64_VALID_CALL_TARGET +// cmp x0, #128 +// b.lt .Lwith_early_jump_128 +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// .Lwith_early_jump_128: +// ... +// ret +// +// These annotations are only required with indirect calls. Private symbols that +// are only the target of direct calls do not require annotations. Also note +// that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not +// indirect jumps (BR). Indirect jumps in assembly are currently not supported +// and would require a macro for BTI 'j'. +// +// Although not necessary, it is safe to use these macros in 32-bit ARM +// assembly. This may be used to simplify dual 32-bit and 64-bit files. 
+//
+// References:
+// - "ELF for the Arm® 64-bit Architecture"
+//   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+// - "Providing protection for complex software"
+//   https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0)   // Has Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET hint #34  // BTI 'c'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0  // No Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 1) == 1  // Signed with A-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)                                       // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #25      // PACIASP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #29  // AUTIASP
+#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 2) == 2  // Signed with B-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)                                       // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #27      // PACIBSP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #31  // AUTIBSP
+#else
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0  // No Pointer Authentication
+#if GNU_PROPERTY_AARCH64_BTI != 0
+#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
+#else
+#define AARCH64_SIGN_LINK_REGISTER
+#endif
+#define AARCH64_VALIDATE_LINK_REGISTER
+#endif
+
+#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
+.pushsection .note.gnu.property, "a";
+.balign 8;
+.long 4;
+.long 0x10;
+.long 0x5;
+.asciz "GNU";
+.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+.long 4;
+.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
+.long 0;
+.popsection;
+#endif
+#endif  // ARM || AARCH64
+
+#endif  // __ASSEMBLER__
+
+#endif  // OPENSSL_HEADER_ASM_BASE_H
diff --git a/include/ring-core/base.h b/include/ring-core/base.h
index f1a027d1a4..938c5b8c57 100644
--- a/include/ring-core/base.h
+++ b/include/ring-core/base.h
@@ -56,10 +56,6 @@
 
 // This file should be the first included by all BoringSSL headers.
 
-#include <stddef.h>
-
-#include <stdint.h>
-
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(push, 3)
 #endif
@@ -71,40 +67,25 @@
 #pragma warning(pop)
 #endif
 
-#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64)
-#define OPENSSL_64_BIT
-#define OPENSSL_X86_64
-#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86)
-#define OPENSSL_32_BIT
-#define OPENSSL_X86
-#elif defined(__AARCH64EL__) || defined(_M_ARM64)
-#define OPENSSL_64_BIT
-#define OPENSSL_AARCH64
-#elif defined(__ARMEL__) || defined(_M_ARM)
-#define OPENSSL_32_BIT
-#define OPENSSL_ARM
-#elif defined(__MIPSEL__) && !defined(__LP64__)
-#define OPENSSL_32_BIT
-#define OPENSSL_MIPS
-#elif defined(__MIPSEL__) && defined(__LP64__)
-#define OPENSSL_64_BIT
-#define OPENSSL_MIPS64
-#elif defined(__wasm__)
-#define OPENSSL_32_BIT
-#else
-// Note BoringSSL only supports standard 32-bit and 64-bit two's-complement,
-// little-endian architectures. Functions will not produce the correct answer
-// on other systems. Run the crypto_test binary, notably
-// crypto/compiler_test.cc, before adding a new architecture. 
-#error "Unknown target CPU"
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
 #endif
 
+#include <ring-core/target.h>  // IWYU pragma: export
+
+#include <stddef.h>
+
+#include <stdint.h>
+
 #if defined(__APPLE__)
-#define OPENSSL_APPLE
+// Note |TARGET_OS_MAC| is set for all Apple OS variants. |TARGET_OS_OSX|
+// targets macOS specifically.
+#if defined(TARGET_OS_OSX) && TARGET_OS_OSX
+#define OPENSSL_MACOS
+#endif
+#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
+#define OPENSSL_IOS
 #endif
-
-#if defined(_WIN32)
-#define OPENSSL_WINDOWS
 #endif
 
 // *ring* doesn't support the `BORINGSSL_SHARED_LIBRARY` configuration, so
diff --git a/include/ring-core/target.h b/include/ring-core/target.h
new file mode 100644
index 0000000000..0213d5c637
--- /dev/null
+++ b/include/ring-core/target.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2023, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_TARGET_H
+#define OPENSSL_HEADER_TARGET_H
+
+// Preprocessor symbols that define the target platform.
+//
+// This file may be included in C, C++, and assembler and must be compatible
+// with each environment. It is separated out only to share code between
+// <ring-core/base.h> and <ring-core/asm_base.h>. Prefer to include those
+// headers instead.
+
+#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64)
+#define OPENSSL_64_BIT
+#define OPENSSL_X86_64
+#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#define OPENSSL_32_BIT
+#define OPENSSL_X86
+#elif defined(__AARCH64EL__) || defined(_M_ARM64)
+#define OPENSSL_64_BIT
+#define OPENSSL_AARCH64
+#elif defined(__ARMEL__) || defined(_M_ARM)
+#define OPENSSL_32_BIT
+#define OPENSSL_ARM
+#elif defined(__MIPSEL__) && !defined(__LP64__)
+#define OPENSSL_32_BIT
+#define OPENSSL_MIPS
+#elif defined(__MIPSEL__) && defined(__LP64__)
+#define OPENSSL_64_BIT
+#define OPENSSL_MIPS64
+#elif defined(__wasm__)
+#define OPENSSL_32_BIT
+#else
+// Note BoringSSL only supports standard 32-bit and 64-bit two's-complement,
+// little-endian architectures. Functions will not produce the correct answer
+// on other systems. Run the crypto_test binary, notably
+// crypto/compiler_test.cc, before adding a new architecture.
+#error "Unknown target CPU"
+#endif
+
+#if defined(__APPLE__)
+#define OPENSSL_APPLE
+#endif
+
+#if defined(_WIN32)
+#define OPENSSL_WINDOWS
+#endif
+
+// Trusty isn't Linux but currently defines __linux__. As a workaround, we
+// exclude it here.
+// TODO(b/169780122): Remove this workaround once Trusty no longer defines it.
+#if defined(__linux__) && !defined(__TRUSTY__) +#define OPENSSL_LINUX +#endif + +#if defined(__Fuchsia__) +#define OPENSSL_FUCHSIA +#endif + +#if defined(__TRUSTY__) +#define OPENSSL_TRUSTY +#define OPENSSL_NO_POSIX_IO +#define OPENSSL_NO_SOCK +#define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED +#endif + +#if defined(OPENSSL_NANOLIBC) +#define OPENSSL_NO_POSIX_IO +#define OPENSSL_NO_SOCK +#define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED +#endif + +#if defined(__ANDROID_API__) +#define OPENSSL_ANDROID +#endif + +#if defined(__FreeBSD__) +#define OPENSSL_FREEBSD +#endif + +#if defined(__OpenBSD__) +#define OPENSSL_OPENBSD +#endif + +// BoringSSL requires platform's locking APIs to make internal global state +// thread-safe, including the PRNG. On some single-threaded embedded platforms, +// locking APIs may not exist, so this dependency may be disabled with the +// following build flag. +// +// IMPORTANT: Doing so means the consumer promises the library will never be +// used in any multi-threaded context. It causes BoringSSL to be globally +// thread-unsafe. Setting it inappropriately will subtly and unpredictably +// corrupt memory and leak secret keys. +// +// Do not set this flag on any platform where threads are possible. BoringSSL +// maintainers will not provide support for any consumers that do so. Changes +// which break such unsupported configurations will not be reverted. +#if !defined(OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED) +#define OPENSSL_THREADS +#endif + +#if defined(BORINGSSL_UNSAFE_FUZZER_MODE) && \ + !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) +#define BORINGSSL_UNSAFE_DETERMINISTIC_MODE +#endif + +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define OPENSSL_ASAN +#endif +#if __has_feature(thread_sanitizer) +#define OPENSSL_TSAN +#endif +#if __has_feature(memory_sanitizer) +#define OPENSSL_MSAN +#define OPENSSL_ASM_INCOMPATIBLE +#endif +#endif + +#if defined(OPENSSL_ASM_INCOMPATIBLE) +#undef OPENSSL_ASM_INCOMPATIBLE +#if !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif // OPENSSL_ASM_INCOMPATIBLE + +#endif // OPENSSL_HEADER_TARGET_H diff --git a/include/ring-core/type_check.h b/include/ring-core/type_check.h index d7e0393451..67df7bc687 100644 --- a/include/ring-core/type_check.h +++ b/include/ring-core/type_check.h @@ -71,5 +71,4 @@ #define OPENSSL_STATIC_ASSERT(cond, msg) _Static_assert(cond, msg) #endif - #endif // OPENSSL_HEADER_TYPE_CHECK_H diff --git a/third_party/fiat/README.md b/third_party/fiat/README.md index 56accd45fa..9c1fc870bb 100644 --- a/third_party/fiat/README.md +++ b/third_party/fiat/README.md @@ -1,8 +1,23 @@ -# Fiat +# Fiat Cryptography -This directory contains code generated by -[Fiat](https://github.com/mit-plv/fiat-crypto) and thus these files are -licensed under the MIT license. (See LICENSE file.) +The files in this directory are generated using [Fiat +Cryptography](https://github.com/mit-plv/fiat-crypto) from the associated +library of arithmetic-implementation templates. These files are included under +the MIT license. (See LICENSE file.) -The files are imported from the `fiat-c/src` directory of the Fiat repository. -Their contents are `#include`d into source files, so we rename them to `.h`. +Some files are included directly from the `fiat-c/src` directory of the Fiat +Cryptography repository. Their contents are `#include`d into source files, so +we rename them to `.h`. 
Implementations that use saturated arithmetic on 64-bit
+words are further manually edited to use platform-appropriate incantations for
+operations such as addition with carry; these changes are marked with "`NOTE:
+edited after generation`".
+
+# CryptOpt
+
+Files in the `asm` directory are compiled from Fiat-Cryptography templates
+using [CryptOpt](https://github.com/0xADE1A1DE/CryptOpt). These generated
+assembly files have been edited to support call-stack unwinding. The modified
+files have been checked for functional correctness using the CryptOpt
+translation validator that is included in the Fiat-Cryptography repository.
+Correct unwinding and manual assembler-directive changes related to object-file
+conventions are validated using unit tests.
diff --git a/third_party/fiat/asm/fiat_curve25519_adx_mul.S b/third_party/fiat/asm/fiat_curve25519_adx_mul.S
new file mode 100644
index 0000000000..f4c70dd41e
--- /dev/null
+++ b/third_party/fiat/asm/fiat_curve25519_adx_mul.S
@@ -0,0 +1,169 @@
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__APPLE__) || defined(__ELF__))
+
+.intel_syntax noprefix
+.text
+#if defined(__APPLE__)
+.private_extern _fiat_curve25519_adx_mul
+.global _fiat_curve25519_adx_mul
+_fiat_curve25519_adx_mul:
+#else
+.type fiat_curve25519_adx_mul, @function
+.hidden fiat_curve25519_adx_mul
+.global fiat_curve25519_adx_mul
+fiat_curve25519_adx_mul:
+#endif
+
+.cfi_startproc
+mov [rsp - 0x08], rbp
+.cfi_offset rbp, -8-0x08
+mov rbp, rsp
+
+mov rax, rdx
+mov rdx, [ rsi + 0x18 ]
+mulx r11, r10, [ rax + 0x8 ]
+mov rdx, [ rax + 0x0 ]
+mov [ rsp - 0x58 ], r15
+.cfi_offset r15, -8-0x58
+mulx r8, rcx, [ rsi + 0x18 ]
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x80 ], rbx
+.cfi_offset rbx, -8-0x80
+mulx rbx, r9, [ rax + 0x18 ]
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x70 ], r12
+.cfi_offset r12, -8-0x70
+mulx r15, r12, [ rax + 0x8 ]
+mov rdx, [ rsi + 0x0 ]
+mov [ rsp - 0x68 ], r13
+.cfi_offset r13, -8-0x68
+mov [ rsp - 0x60 ], r14
+.cfi_offset r14, -8-0x60
+mulx r14, r13, [ rax + 0x0 ]
+mov rdx, [ rax + 0x10 ]
+mov [ rsp - 0x18 ], r15
+mov [ rsp - 0x50 ], rdi
+mulx rdi, r15, [ rsi + 0x0 ]
+mov rdx, [ rax + 0x18 ]
+mov [ rsp - 0x48 ], r13
+mov [ rsp - 0x40 ], r9
+mulx r9, r13, [ rsi + 0x0 ]
+test al, al
+adox rcx, rdi
+mov rdx, [ rsi + 0x10 ]
+mov [ rsp - 0x38 ], r13
+mulx r13, rdi, [ rax + 0x8 ]
+adox r10, r9
+mov rdx, 0x0
+adox rbx, rdx
+adcx rdi, rcx
+adcx r8, r10
+mov r9, rdx
+adcx r9, rbx
+mov rdx, [ rsi + 0x10 ]
+mulx r10, rcx, [ rax + 0x0 ]
+mov rdx, [ rsi + 0x0 ]
+mov [ rsp - 0x30 ], r15
+mulx r15, rbx, [ rax + 0x8 ]
+mov rdx, -0x2
+inc rdx
+adox rcx, r15
+setc r15b
+clc
+adcx rcx, r12
+adox r10, rdi
+mov rdx, [ rax + 0x10 ]
+mov [ rsp - 0x78 ], rcx
+mulx rcx, rdi, [ rsi + 0x10 ]
+adox rdi, r8
+mov rdx, [ rax + 0x18 ]
+mov [ rsp - 0x28 ], rcx
+mulx rcx, r8, [ rsi + 0x10 ]
+mov rdx, [ rax + 0x10 ]
+mov [ rsp - 0x20 ], r8
+mulx r12, r8, [ rsi + 0x18 ]
+adox r8, r9
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x10 ], r12
+mulx r12, r9, [ rax + 0x10 ]
+movzx rdx, r15b
+lea rdx, [ rdx + rcx ]
+adcx r9, r10
+adcx r13, rdi
+mov r15, 0x0
+mov r10, r15
+adox r10, rdx
+mov rdx, [ rax + 0x18 ]
+mulx rcx, rdi, [ rsi + 0x18 ]
+adox rcx, r15
+adcx r11, r8
+mov rdx, r15
+adcx rdx, r10
+adcx rcx, r15
+mov r8, rdx
+mov rdx, [ rax + 0x0 ]
+mulx r15, r10, [ rsi + 0x8 ]
+test al, al
+adox r10, r14
+adcx rbx, r10
+adox r15, [ rsp - 0x78 ]
+adcx r15, [ rsp - 0x30 ]
+adox r9, [ rsp - 0x18 ]
+adcx r9, [ rsp - 0x38 ]
+adox r13, [ rsp - 0x40 ]
+adcx r12, r13
+adox r11, [ rsp - 0x20 ]
+adcx r11, [ rsp - 0x28 ]
+mov rdx, 0x26
+mulx rsi, r14, r12
+adox rdi, r8
+adcx rdi, [ rsp - 0x10 ]
+mulx r10, r8, r11
+mov r13, 0x0
+adox rcx, r13
+adcx rcx, r13
+mulx r11, r12, rdi
+xor rdi, rdi
+adox r8, rbx
+adox r12, r15
+mulx rbx, r13, rcx
+adcx r14, [ rsp - 0x48 ]
+adox r13, r9
+adox rbx, rdi
+adcx rsi, r8
+adcx r10, r12
+adcx r11, r13
+adc rbx, 0x0
+mulx r9, r15, rbx
+xor r9, r9
+adox r15, r14
+mov rdi, r9
+adox rdi, rsi
+mov rcx, r9
+adox rcx, r10
+mov r8, [ rsp - 0x50 ]
+mov [ r8 + 0x8 ], rdi
+mov r12, r9
+adox r12, r11
+mov r14, r9
+cmovo r14, rdx
+mov [ r8 + 0x18 ], r12
+adcx r15, r14
+mov [ r8 + 0x0 ], r15
+mov [ r8 + 0x10 ], rcx
+mov rbx, [ rsp - 0x80 ]
+mov r12, [ rsp - 0x70 ]
+mov r13, [ rsp - 0x68 ]
+mov r14, [ rsp - 0x60 ]
+mov r15, [ rsp - 0x58 ]
+
+mov rbp, [rsp - 0x08]
+ret
+.cfi_endproc
+#if defined(__ELF__)
+.size fiat_curve25519_adx_mul, .-fiat_curve25519_adx_mul
+#endif
+
+#endif
diff --git a/third_party/fiat/asm/fiat_curve25519_adx_square.S b/third_party/fiat/asm/fiat_curve25519_adx_square.S
new file mode 100644
index 0000000000..9b1fdb9cf5
--- /dev/null
+++ b/third_party/fiat/asm/fiat_curve25519_adx_square.S
@@ -0,0 +1,137 @@
+#include <ring-core/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+    (defined(__APPLE__) || defined(__ELF__))
+
+.intel_syntax noprefix
+.text
+#if defined(__APPLE__)
+.private_extern _fiat_curve25519_adx_square
+.global _fiat_curve25519_adx_square
+_fiat_curve25519_adx_square:
+#else
+.type fiat_curve25519_adx_square, @function
+.hidden fiat_curve25519_adx_square
+.global fiat_curve25519_adx_square
+fiat_curve25519_adx_square:
+#endif
+
+.cfi_startproc
+mov [rsp - 0x08], rbp
+.cfi_offset rbp, -8-0x08
+mov rbp, rsp
+
+mov rdx, [ rsi + 0x0 ]
+mulx r10, rax, [ rsi + 0x8 ]
+mov rdx, [ rsi + 0x0 ]
+mulx rcx, r11, [ rsi + 0x10 ]
+xor rdx, rdx
+adox r11, r10
+mov rdx, [ rsi + 0x0 ]
+mulx r9, r8, [ rsi + 0x18 ]
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x80 ], rbx
+.cfi_offset rbx, -8-0x80
+mulx rbx, r10, [ rsi + 0x18 ]
+adox r8, rcx
+mov [rsp - 0x48 ], rdi
+adox r10, r9
+adcx rax, rax
+mov rdx, [ rsi + 0x10 ]
+mulx r9, rcx, [ rsi + 0x18 ]
+adox rcx, rbx
+mov rdx, [ rsi + 0x10 ]
+mulx rdi, rbx, [ rsi + 0x8 ]
+mov rdx, 0x0
+adox r9, rdx
+mov [ rsp - 0x70 ], r12
+.cfi_offset r12, -8-0x70
+mov r12, -0x3
+inc r12
+adox rbx, r8
+adox rdi, r10
+adcx r11, r11
+mov r8, rdx
+adox r8, rcx
+mov r10, rdx
+adox r10, r9
+adcx rbx, rbx
+mov rdx, [ rsi + 0x0 ]
+mulx r9, rcx, rdx
+mov rdx, [ rsi + 0x8 ]
+mov [ rsp - 0x68 ], r13
+.cfi_offset r13, -8-0x68
+mov [ rsp - 0x60 ], r14
+.cfi_offset r14, -8-0x60
+mulx r14, r13, rdx
+seto dl
+inc r12
+adox r9, rax
+adox r13, r11
+adox r14, rbx
+adcx rdi, rdi
+mov al, dl
+mov rdx, [ rsi + 0x10 ]
+mulx rbx, r11, rdx
+adox r11, rdi
+adcx r8, r8
+adox rbx, r8
+adcx r10, r10
+movzx rdx, al
+mov rdi, 0x0
+adcx rdx, rdi
+movzx r8, al
+lea r8, [ r8 + rdx ]
+mov rdx, [ rsi + 0x18 ]
+mulx rdi, rax, rdx
+adox rax, r10
+mov rdx, 0x26
+mov [ rsp - 0x58 ], r15
+.cfi_offset r15, -8-0x58
+mulx r15, r10, r11
+clc
+adcx r10, rcx
+mulx r11, rcx, rbx
+adox r8, rdi
+mulx rdi, rbx, r8
+inc r12
+adox rcx, r9
+mulx r8, r9, rax
+adcx r15, rcx
+adox r9, r13
+adcx r11, r9
+adox rbx, r14
+adox rdi, r12
+adcx r8, rbx
+adc rdi, 0x0
+mulx r14, r13, rdi
+test al, al
+mov rdi, [ rsp - 0x48 ]
+adox r13, r10
+mov r14, r12
+adox r14, r15
+mov [ rdi + 0x8 ], r14
+mov rax, r12
+adox rax, r11
+mov r10, r12
+adox r10, r8
+mov [ rdi + 0x10 ], rax
+mov rcx, r12
+cmovo rcx, rdx
+adcx r13, rcx
+mov [ rdi + 0x0 ], r13
+mov [ rdi + 0x18 ], r10
+mov rbx, [ rsp - 0x80 ]
+mov r12, [ rsp - 0x70 ]
+mov r13, [ rsp - 0x68 ]
+mov r14, [ rsp - 0x60 ]
+mov r15, [ rsp - 0x58 ]
+
+mov rbp, [rsp - 0x08]
+ret
+.cfi_endproc
+#if defined(__ELF__)
+.size fiat_curve25519_adx_square, .-fiat_curve25519_adx_square
+#endif
+
+#endif
diff --git a/third_party/fiat/curve25519_64_adx.h b/third_party/fiat/curve25519_64_adx.h
new file mode 100644
index 0000000000..f50f5b8377
--- /dev/null
+++ b/third_party/fiat/curve25519_64_adx.h
@@ -0,0 +1,691 @@
+#include <ring-core/base.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef uint64_t fe4[4];
+typedef uint8_t fiat_uint1;
+typedef int8_t fiat_int1;
+
+static __inline__ uint64_t fiat_value_barrier_u64(uint64_t a) {
+  __asm__("" : "+r"(a) : /* no inputs */);
+  return a;
+}
+
+__attribute__((target("adx,bmi2")))
+static inline void fe4_mul(fe4 out, const fe4 x, const fe4 y) { fiat_curve25519_adx_mul(out, x, y); }
+
+__attribute__((target("adx,bmi2")))
+static inline void fe4_sq(fe4 out, const fe4 x) { fiat_curve25519_adx_square(out, x); }
+
+/*
+ * The function fiat_mulx_u64 is a multiplication, returning the full double-width result.
+ *
+ * Postconditions:
+ *   out1 = (arg1 * arg2) mod 2^64
+ *   out2 = ⌊arg1 * arg2 / 2^64⌋
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0xffffffffffffffff]
+ *   arg2: [0x0 ~> 0xffffffffffffffff]
+ * Output Bounds:
+ *   out1: [0x0 ~> 0xffffffffffffffff]
+ *   out2: [0x0 ~> 0xffffffffffffffff]
+ */
+__attribute__((target("adx,bmi2")))
+static inline void fiat_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) {
+// NOTE: edited after generation
+#if defined(_M_X64)
+  unsigned long long t;
+  *out1 = _umul128(arg1, arg2, &t);
+  *out2 = t;
+#elif defined(_M_ARM64)
+  *out1 = arg1 * arg2;
+  *out2 = __umulh(arg1, arg2);
+#else
+  unsigned __int128 t = (unsigned __int128)arg1 * arg2;
+  *out1 = t;
+  *out2 = (t >> 64);
+#endif
+}
+
+/*
+ * The function fiat_addcarryx_u64 is an addition with carry.
+ *
+ * Postconditions:
+ *   out1 = (arg1 + arg2 + arg3) mod 2^64
+ *   out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0x1]
+ *   arg2: [0x0 ~> 0xffffffffffffffff]
+ *   arg3: [0x0 ~> 0xffffffffffffffff]
+ * Output Bounds:
+ *   out1: [0x0 ~> 0xffffffffffffffff]
+ *   out2: [0x0 ~> 0x1]
+ */
+__attribute__((target("adx,bmi2")))
+static inline void fiat_addcarryx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+// NOTE: edited after generation
+#if defined(__has_builtin)
+# if __has_builtin(__builtin_ia32_addcarryx_u64)
+#  define addcarry64 __builtin_ia32_addcarryx_u64
+# endif
+#endif
+#if defined(addcarry64)
+  long long unsigned int t;
+  *out2 = addcarry64(arg1, arg2, arg3, &t);
+  *out1 = t;
+#elif defined(_M_X64)
+  long long unsigned int t;
+  *out2 = _addcarry_u64(arg1, arg2, arg3, &t);
+  *out1 = t;
+#else
+  arg2 += arg1;
+  arg1 = arg2 < arg1;
+  uint64_t ret = arg2 + arg3;
+  arg1 += ret < arg2;
+  *out1 = ret;
+  *out2 = arg1;
+#endif
+#undef addcarry64
+}
+
+/*
+ * The function fiat_subborrowx_u64 is a subtraction with borrow.
+ * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_subborrowx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_subborrow_u64) +# define subborrow64 __builtin_ia32_subborrow_u64 +# endif +#endif +#if defined(subborrow64) + long long unsigned int t; + *out2 = subborrow64(arg1, arg2, arg3, &t); + *out1 = t; +#elif defined(_M_X64) + long long unsigned int t; + *out2 = _subborrow_u64(arg1, arg2, arg3, &t); // NOTE: edited after generation + *out1 = t; +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +#undef subborrow64 +} + +/* + * The function fiat_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_cmovznz_u64(uint64_t* out1, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_value_barrier_u64(x2) & arg3) | (fiat_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_add(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + fiat_uint1 x2; + uint64_t x3; + fiat_uint1 x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + fiat_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_uint1 x11; + uint64_t x12; + fiat_uint1 x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + uint64_t x19; + fiat_uint1 x20; + fiat_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_cmovznz_u64(&x9, x8, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_addcarryx_u64(&x12, &x13, x11, x3, 0x0); + fiat_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_addcarryx_u64(&x16, &x17, x15, x7, 0x0); + fiat_cmovznz_u64(&x18, x17, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x19, &x20, 0x0, x10, x18); + out1[0] = x19; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 
0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_sub(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + uint64_t x2; + fiat_uint1 x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + fiat_uint1 x9; + uint64_t x10; + uint64_t x11; + fiat_uint1 x12; + uint64_t x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + fiat_uint1 x19; + uint64_t x20; + fiat_uint1 x21; + uint64_t x22; + uint64_t x23; + fiat_uint1 x24; + x1 = (arg2[0]); + fiat_subborrowx_u64(&x2, &x3, 0x0, (arg1[0]), x1); + x4 = (arg2[1]); + fiat_subborrowx_u64(&x5, &x6, x3, (arg1[1]), x4); + x7 = (arg2[2]); + fiat_subborrowx_u64(&x8, &x9, x6, (arg1[2]), x7); + x10 = (arg2[3]); + fiat_subborrowx_u64(&x11, &x12, x9, (arg1[3]), x10); + fiat_cmovznz_u64(&x13, x12, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_subborrowx_u64(&x14, &x15, 0x0, x2, x13); + fiat_subborrowx_u64(&x16, &x17, x15, x5, 0x0); + fiat_subborrowx_u64(&x18, &x19, x17, x8, 0x0); + fiat_subborrowx_u64(&x20, &x21, x19, x11, 0x0); + fiat_cmovznz_u64(&x22, x21, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_subborrowx_u64(&x23, &x24, 0x0, x14, x22); + out1[0] = x23; + out1[1] = x16; + out1[2] = x18; + out1[3] = x20; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [0x0 ~> 0x3ffffffffffffff] // NOTE: this is not any uint64! + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_scmul(uint64_t out1[4], const uint64_t arg1[4], uint64_t arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + fiat_uint1 x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_uint1 x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + fiat_uint1 x18; + uint64_t x19; + fiat_uint1 x20; + uint64_t x21; + fiat_uint1 x22; + uint64_t x23; + fiat_uint1 x24; + uint64_t x25; + uint64_t x26; + fiat_uint1 x27; + fiat_mulx_u64(&x1, &x2, (arg1[0]), arg2); + fiat_mulx_u64(&x3, &x4, (arg1[1]), arg2); + fiat_addcarryx_u64(&x5, &x6, 0x0, x2, x3); + fiat_mulx_u64(&x7, &x8, (arg1[2]), arg2); + fiat_addcarryx_u64(&x9, &x10, x6, x4, x7); + fiat_mulx_u64(&x11, &x12, (arg1[3]), arg2); + fiat_addcarryx_u64(&x13, &x14, x10, x8, x11); + fiat_mulx_u64(&x15, &x16, (x12 + (uint64_t)x14), UINT8_C(0x26)); + fiat_addcarryx_u64(&x17, &x18, 0x0, x1, x15); + fiat_addcarryx_u64(&x19, &x20, x18, x5, 0x0); + fiat_addcarryx_u64(&x21, &x22, x20, x9, 0x0); + fiat_addcarryx_u64(&x23, &x24, x22, x13, 0x0); + fiat_cmovznz_u64(&x25, x24, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x26, &x27, 0x0, x17, x25); + out1[0] = x26; + out1[1] = x19; + out1[2] = x21; + out1[3] = x23; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ 
+__attribute__((target("adx,bmi2")))
+static void fe4_canon(uint64_t out1[4], const uint64_t arg1[4]) {
+  uint64_t x1;
+  fiat_uint1 x2;
+  uint64_t x3;
+  fiat_uint1 x4;
+  uint64_t x5;
+  fiat_uint1 x6;
+  uint64_t x7;
+  fiat_uint1 x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  uint64_t x13;
+  fiat_uint1 x14;
+  uint64_t x15;
+  fiat_uint1 x16;
+  uint64_t x17;
+  fiat_uint1 x18;
+  uint64_t x19;
+  fiat_uint1 x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  fiat_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0xffffffffffffffed));
+  fiat_subborrowx_u64(&x3, &x4, x2, (arg1[1]), UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x5, &x6, x4, (arg1[2]), UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7fffffffffffffff));
+  fiat_cmovznz_u64(&x9, x8, x1, (arg1[0]));
+  fiat_cmovznz_u64(&x10, x8, x3, (arg1[1]));
+  fiat_cmovznz_u64(&x11, x8, x5, (arg1[2]));
+  fiat_cmovznz_u64(&x12, x8, x7, (arg1[3]));
+  fiat_subborrowx_u64(&x13, &x14, 0x0, x9, UINT64_C(0xffffffffffffffed));
+  fiat_subborrowx_u64(&x15, &x16, x14, x10, UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x17, &x18, x16, x11, UINT64_C(0xffffffffffffffff));
+  fiat_subborrowx_u64(&x19, &x20, x18, x12, UINT64_C(0x7fffffffffffffff));
+  fiat_cmovznz_u64(&x21, x20, x13, x9);
+  fiat_cmovznz_u64(&x22, x20, x15, x10);
+  fiat_cmovznz_u64(&x23, x20, x17, x11);
+  fiat_cmovznz_u64(&x24, x20, x19, x12);
+  out1[0] = x21;
+  out1[1] = x22;
+  out1[2] = x23;
+  out1[3] = x24;
+}
+
+/*
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0x1]
+ *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+__attribute__((target("adx,bmi2")))
+static void fe4_cswap(uint64_t out1[4], uint64_t out2[4], fiat_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  // NOTE: clang 14 for Zen 2 uses YMM registers
+  fiat_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
+  fiat_cmovznz_u64(&x5, arg1, (arg3[0]), (arg2[0]));
+  fiat_cmovznz_u64(&x6, arg1, (arg3[1]), (arg2[1]));
+  fiat_cmovznz_u64(&x7, arg1, (arg3[2]), (arg2[2]));
+  fiat_cmovznz_u64(&x8, arg1, (arg3[3]), (arg2[3]));
+  out1[0] = x1;
+  out1[1] = x2;
+  out1[2] = x3;
+  out1[3] = x4;
+  out2[0] = x5;
+  out2[1] = x6;
+  out2[2] = x7;
+  out2[3] = x8;
+}
+
+// The following functions are adapted from crypto/curve25519/curve25519.c.
+// It would be desirable to share the code, but with the current field
+// implementations both 4-limb and 5-limb versions of the curve-level code need
+// to be included in builds targeting an unknown variant of x86_64.
+ +__attribute__((target("adx,bmi2"))) +static void fe4_invert(fe4 out, const fe4 z) { + fe4 t0; + fe4 t1; + fe4 t2; + fe4 t3; + int i; + + fe4_sq(t0, z); + fe4_sq(t1, t0); + for (i = 1; i < 2; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(t1, z, t1); + fe4_mul(t0, t0, t1); + fe4_sq(t2, t0); + fe4_mul(t1, t1, t2); + fe4_sq(t2, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 20; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 100; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t1, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(out, t1, t0); +} + +__attribute__((target("adx,bmi2"))) +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { + uint8_t e[32]; + memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. 
+ // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe4 x1, x2 = {1}, z2 = {0}, x3, z3 = {1}, tmp0, tmp1; + OPENSSL_memcpy(x1, point, sizeof(fe4)); + x1[3] &= (uint64_t)(-1)>>1; + OPENSSL_memcpy(x3, x1, sizeof(fe4)); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe4_sub(tmp0, x3, z3); + fe4_sub(tmp1, x2, z2); + fe4_add(x2, x2, z2); + fe4_add(z2, x3, z3); + fe4_mul(z3, tmp0, x2); + fe4_mul(z2, z2, tmp1); + fe4_sq(tmp0, tmp1); + fe4_sq(tmp1, x2); + fe4_add(x3, z3, z2); + fe4_sub(z2, z3, z2); + fe4_mul(x2, tmp1, tmp0); + fe4_sub(tmp1, tmp1, tmp0); + fe4_sq(z2, z2); + fe4_scmul(z3, tmp1, 121666); + fe4_sq(x3, x3); + fe4_add(tmp0, tmp0, z3); + fe4_mul(z3, x1, z2); + fe4_mul(z2, tmp1, tmp0); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + + fe4_invert(z2, z2); + fe4_mul(x2, x2, z2); + fe4_canon(x2, x2); + OPENSSL_memcpy(out, x2, sizeof(fe4)); +} + +typedef struct { + fe4 X; + fe4 Y; + fe4 Z; + fe4 T; +} ge_p3_4; + +typedef struct { + fe4 yplusx; + fe4 yminusx; + fe4 xy2d; +} ge_precomp_4; + +__attribute__((target("adx,bmi2"))) +static void inline_x25519_ge_dbl_4(ge_p3_4 *r, const ge_p3_4 *p, bool skip_t) { + // Transcribed from a Coq function proven against affine coordinates. + // https://github.com/mit-plv/fiat-crypto/blob/9943ba9e7d8f3e1c0054b2c94a5edca46ea73ef8/src/Curves/Edwards/XYZT/Basic.v#L136-L165 + fe4 trX, trZ, trT, t0, cX, cY, cZ, cT; + fe4_sq(trX, p->X); + fe4_sq(trZ, p->Y); + fe4_sq(trT, p->Z); + fe4_add(trT, trT, trT); + fe4_add(cY, p->X, p->Y); + fe4_sq(t0, cY); + fe4_add(cY, trZ, trX); + fe4_sub(cZ, trZ, trX); + fe4_sub(cX, t0, cY); + fe4_sub(cT, trT, cZ); + fe4_mul(r->X, cX, cT); + fe4_mul(r->Y, cY, cZ); + fe4_mul(r->Z, cZ, cT); + if (!skip_t) { + fe4_mul(r->T, cX, cY); + } +} + +__attribute__((target("adx,bmi2"))) +__attribute__((always_inline)) // 4% speedup with clang14 and zen2 +static inline void +ge_p3_add_p3_precomp_4(ge_p3_4 *r, const ge_p3_4 *p, const ge_precomp_4 *q) { + fe4 A, B, C, YplusX, YminusX, D, X3, Y3, Z3, T3; + // Transcribed from a Coq function proven against affine coordinates. 
+ // https://github.com/mit-plv/fiat-crypto/blob/a36568d1d73aff5d7accc79fd28be672882f9c17/src/Curves/Edwards/XYZT/Precomputed.v#L38-L56 + fe4_add(YplusX, p->Y, p->X); + fe4_sub(YminusX, p->Y, p->X); + fe4_mul(A, YplusX, q->yplusx); + fe4_mul(B, YminusX, q->yminusx); + fe4_mul(C, q->xy2d, p->T); + fe4_add(D, p->Z, p->Z); + fe4_sub(X3, A, B); + fe4_add(Y3, A, B); + fe4_add(Z3, D, C); + fe4_sub(T3, D, C); + fe4_mul(r->X, X3, T3); + fe4_mul(r->Y, Y3, Z3); + fe4_mul(r->Z, Z3, T3); + fe4_mul(r->T, X3, Y3); +} + +__attribute__((always_inline)) // 25% speedup with clang14 and zen2 +static inline void table_select_4(ge_precomp_4 *t, const int pos, + const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; +#if defined(__clang__) + __asm__("" : "+m" (t_bytes) : /*no inputs*/); +#endif + static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + static_assert(sizeof(t_bytes) == sizeof(ge_precomp_4), ""); + + // fe4 uses saturated 64-bit limbs, so converting from bytes is just a copy. + OPENSSL_memcpy(t, t_bytes, sizeof(ge_precomp_4)); + + fe4 xy2d_neg = {0}; + fe4_sub(xy2d_neg, xy2d_neg, t->xy2d); + constant_time_conditional_memcpy(t->yplusx, t_bytes[1], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->yminusx, t_bytes[0], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->xy2d, xy2d_neg, sizeof(fe4), bnegative); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. +// +// Preconditions: +// a[31] <= 127 +__attribute__((target("adx,bmi2"))) +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]) { + signed char e[64]; + signed char carry; + + for (unsigned i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (unsigned i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_4 r = {{0}, {1}, {1}, {0}}; + for (unsigned i = 1; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/false); + + for (unsigned i = 0; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + // fe4 uses saturated 64-bit limbs, so converting to bytes is just a copy. + // Satisfy stated precondition of fiat_25519_from_bytes; tests pass either way + fe4_canon(r.X, r.X); + fe4_canon(r.Y, r.Y); + fe4_canon(r.Z, r.Z); + fe4_canon(r.T, r.T); + static_assert(sizeof(ge_p3_4) == sizeof(uint8_t[4][32]), ""); + OPENSSL_memcpy(h, &r, sizeof(ge_p3_4)); +}
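Reviewer note (not part of the patch): the constant 0x26 = 38 that recurs in
fe4_add, fe4_sub, fe4_scmul, and both CryptOpt .S files is the reduction
constant for p = 2^255 - 19. The fe4 representation holds a field element in
four saturated 64-bit limbs, i.e. modulo 2^256, and a carry out of bit 256 is
folded back into the low limb with a multiply by 38 because

    2^256 = 2 * (2^255 - 19) + 38,  so  2^256 ≡ 38 (mod 2^255 - 19).

fe4_sub uses the same identity with the sign flipped: a borrow out of bit 256
is repaid by subtracting 38.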
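The digit recoding at the heart of x25519_ge_scalarmult_base_adx is easy to
sanity-check in isolation. The following standalone C program (not part of the
patch; the test scalar is arbitrary and only needs to satisfy the stated
precondition a[31] <= 127) runs the exact recoding loop from the patch and
verifies both claims in its comments: every digit lands in [-8, 8], and the
signed digits still represent the original scalar in base 16.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t a[32] = {0x09, 0xff, 0x5a, 0x00, 0x77};  // remaining bytes zero
  signed char e[64];
  signed char carry;

  // Split each byte into two base-16 digits, low nibble first (as in the patch).
  for (unsigned i = 0; i < 32; ++i) {
    e[2 * i + 0] = (a[i] >> 0) & 15;
    e[2 * i + 1] = (a[i] >> 4) & 15;
  }
  // Recode digits from [0,15] into [-8,8]: a digit >= 8 becomes digit - 16 and
  // pushes a +1 carry into the next digit (verbatim loop from the patch).
  carry = 0;
  for (unsigned i = 0; i < 63; ++i) {
    e[i] += carry;
    carry = e[i] + 8;
    carry >>= 4;
    e[i] -= carry << 4;
  }
  e[63] += carry;

  // Check: each e[i] is in [-8,8] and sum(e[i] * 16^i) == a. Reconstruct plain
  // base-16 digits by resolving negative digits with borrows, then compare.
  int t[65] = {0};
  for (unsigned i = 0; i < 64; ++i) {
    assert(-8 <= e[i] && e[i] <= 8);
    t[i] += e[i];
    if (t[i] < 0) {
      t[i] += 16;
      t[i + 1] -= 1;
    }
  }
  assert(t[64] == 0);
  for (unsigned i = 0; i < 32; ++i) {
    assert(((t[2 * i + 1] << 4) | t[2 * i]) == a[i]);
  }
  printf("recoding OK\n");
  return 0;
}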
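Similarly, the portable #else fallback of fiat_addcarryx_u64 can be checked
against its stated postconditions (out1 = (arg1 + arg2 + arg3) mod 2^64,
out2 = the carry out) with a small standalone sketch. This is not part of the
patch; it uses unsigned __int128 as the reference, so it assumes a GCC- or
Clang-style 64-bit compiler.

#include <assert.h>
#include <stdint.h>

// Mirror of the patch's portable fallback, with plain uint8_t for the carries.
static void addcarry_portable(uint64_t *out1, uint8_t *out2, uint8_t arg1,
                              uint64_t arg2, uint64_t arg3) {
  arg2 += arg1;                // add the carry-in
  arg1 = arg2 < arg1;          // did that addition wrap?
  uint64_t ret = arg2 + arg3;  // main addition
  arg1 += ret < arg2;          // did that addition wrap?
  *out1 = ret;
  *out2 = arg1;                // at most one of the two additions can wrap
}

int main(void) {
  const uint64_t cases[] = {0, 1, 0x8000000000000000u, UINT64_MAX};
  for (unsigned c = 0; c <= 1; ++c) {
    for (unsigned i = 0; i < 4; ++i) {
      for (unsigned j = 0; j < 4; ++j) {
        uint64_t lo;
        uint8_t hi;
        addcarry_portable(&lo, &hi, (uint8_t)c, cases[i], cases[j]);
        unsigned __int128 s = (unsigned __int128)cases[i] + cases[j] + c;
        assert(lo == (uint64_t)s);
        assert(hi == (uint8_t)(s >> 64));
      }
    }
  }
  return 0;
}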