Merge pull request #1651 from briansmith/b/merge-boringssl-11

Merge BoringSSL through 27e45c4
briansmith · Sep 26, 2023 · b04bed1 · b04bed1
2 parents ad59665 + 0ae93f0
commit b04bed1
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 59 deletions.
diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c
@@ -754,7 +754,7 @@ static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
 
 static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
                                        const AES_KEY *key) {
-  for (unsigned i = 0; i <= key->rounds; i++) {
+  for (size_t i = 0; i <= key->rounds; i++) {
     // Copy the round key into each block in the batch.
     for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
       aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
@@ -921,8 +921,8 @@ void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
   uint32_t ctr = CRYPTO_load_u32_be(ivs + 12);
   for (;;) {
     // Update counters.
-    for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
-      CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + i);
+    for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
+      CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i);
     }
 
     size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;

diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -158,8 +158,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to index and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 $code.=<<___;
 	paddd	%xmm0,%xmm1
 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
@@ -228,7 +232,8 @@
 }
 $code.=<<___;
 	por	%xmm1,%xmm0
-	pshufd	\$0x4e,%xmm0,%xmm1
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm0,%xmm1	# Swap upper and lower halves.
 	por	%xmm1,%xmm0
 	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[0]
@@ -321,7 +326,8 @@
 }
 $code.=<<___;
 	por	%xmm5,%xmm4
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	lea	$STRIDE($bp),$bp
 
@@ -575,7 +581,6 @@
 ___
 		$bp="%r12";
 		$STRIDE=2**5*8;		# 5 is "window size"
-		$N=$STRIDE/4;		# should match cache line size
 		$tp=$i;
 $code.=<<___;
 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
@@ -589,8 +594,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to index and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 $code.=<<___;
 	paddd	%xmm0,%xmm1
 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
@@ -659,7 +668,8 @@
 }
 $code.=<<___;
 	por	%xmm1,%xmm0
-	pshufd	\$0x4e,%xmm0,%xmm1
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm0,%xmm1	# Swap upper and lower halves.
 	por	%xmm1,%xmm0
 	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[0]
@@ -836,7 +846,8 @@
 }
 $code.=<<___;
 	por	%xmm5,%xmm4
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[i]
@@ -2227,7 +2238,6 @@
    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
 my $rptr=$bptr;
 my $STRIDE=2**5*8;		# 5 is "window size"
-my $N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
@@ -2240,8 +2250,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to index and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 $code.=<<___;
 	.byte	0x67
 	paddd	%xmm0,%xmm1
@@ -2310,7 +2324,8 @@
 }
 $code.=<<___;
 	pxor	%xmm1,%xmm0
-	pshufd	\$0x4e,%xmm0,%xmm1
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm0,%xmm1	# Swap upper and lower halves.
 	por	%xmm1,%xmm0
 	lea	$STRIDE($bptr),$bptr
 	movq	%xmm0,%rdx		# bp[0]
@@ -2430,7 +2445,8 @@
 }
 $code.=<<___;
 	por	%xmm5,%xmm4
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	lea	$STRIDE($bptr),$bptr
 	movq	%xmm0,%rdx		# m0=bp[i]
@@ -3434,6 +3450,15 @@
 .cfi_startproc
 	cmp	\$0, $num
 	jz	.Lscatter_epilogue
+
+	# $tbl stores 32 entries, t0 through t31. Each entry has $num words.
+	# They are interleaved in memory as follows:
+	#
+	#  t0[0]      t1[0]      t2[0]      ... t31[0]
+	#  t0[1]      t1[1]      t2[1]      ... t31[1]
+	#  ...
+	#  t0[$num-1] t1[$num-1] t2[$num-1] ... t31[$num-1]
+
 	lea	($tbl,$idx,8),$tbl
 .Lscatter:
 	mov	($inp),%rax
@@ -3471,8 +3496,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to $idx and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 for($i=0;$i<$STRIDE/16;$i+=4) {
 $code.=<<___;
 	paddd	%xmm0,%xmm1
@@ -3510,6 +3539,8 @@
 	pxor	%xmm5,%xmm5
 ___
 for($i=0;$i<$STRIDE/16;$i+=4) {
+# Combine the masks with the corresponding table entries to select the correct
+# entry.
 $code.=<<___;
 	movdqa	`16*($i+0)-128`(%r11),%xmm0
 	movdqa	`16*($i+1)-128`(%r11),%xmm1
@@ -3528,7 +3559,8 @@
 $code.=<<___;
 	por	%xmm5,%xmm4
 	lea	$STRIDE(%r11),%r11
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	movq	%xmm0,($out)		# m0=bp[0]
 	lea	8($out),$out

diff --git a/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
@@ -1261,7 +1261,7 @@
 
 ////////////////////////////////////////////////////////////////////////
 // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
-//                                int rep);
+//                                uint64_t rep);
 .globl	ecp_nistz256_ord_sqr_mont
 .type	ecp_nistz256_ord_sqr_mont,%function
 .align	4

diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
@@ -29,17 +29,6 @@
 #pragma GCC diagnostic ignored "-Wconversion"
 #endif
 
-// We can assume little-endian.
-static uint32_t U8TO32_LE(const uint8_t *m) {
-  uint32_t r;
-  OPENSSL_memcpy(&r, m, sizeof(r));
-  return r;
-}
-
-static void U32TO8_LE(uint8_t *m, uint32_t v) {
-  OPENSSL_memcpy(m, &v, sizeof(v));
-}
-
 static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; }
 
 struct poly1305_state_st {
@@ -78,10 +67,10 @@ static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
   }
 
 poly1305_donna_16bytes:
-  t0 = U8TO32_LE(in);
-  t1 = U8TO32_LE(in + 4);
-  t2 = U8TO32_LE(in + 8);
-  t3 = U8TO32_LE(in + 12);
+  t0 = CRYPTO_load_u32_le(in);
+  t1 = CRYPTO_load_u32_le(in + 4);
+  t2 = CRYPTO_load_u32_le(in + 8);
+  t3 = CRYPTO_load_u32_le(in + 12);
 
   in += 16;
   len -= 16;
@@ -144,10 +133,10 @@ static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
   }
   len = 0;
 
-  t0 = U8TO32_LE(mp + 0);
-  t1 = U8TO32_LE(mp + 4);
-  t2 = U8TO32_LE(mp + 8);
-  t3 = U8TO32_LE(mp + 12);
+  t0 = CRYPTO_load_u32_le(mp + 0);
+  t1 = CRYPTO_load_u32_le(mp + 4);
+  t2 = CRYPTO_load_u32_le(mp + 8);
+  t3 = CRYPTO_load_u32_le(mp + 12);
 
   state->h0 += t0 & 0x3ffffff;
   state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
@@ -162,10 +151,10 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) {
   struct poly1305_state_st *state = poly1305_aligned_state(statep);
   uint32_t t0, t1, t2, t3;
 
-  t0 = U8TO32_LE(key + 0);
-  t1 = U8TO32_LE(key + 4);
-  t2 = U8TO32_LE(key + 8);
-  t3 = U8TO32_LE(key + 12);
+  t0 = CRYPTO_load_u32_le(key + 0);
+  t1 = CRYPTO_load_u32_le(key + 4);
+  t2 = CRYPTO_load_u32_le(key + 8);
+  t3 = CRYPTO_load_u32_le(key + 12);
 
   // precompute multipliers
   state->r0 = t0 & 0x3ffffff;
@@ -241,7 +230,6 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in,
 
 void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
   struct poly1305_state_st *state = poly1305_aligned_state(statep);
-  uint64_t f0, f1, f2, f3;
   uint32_t g0, g1, g2, g3, g4;
   uint32_t b, nb;
 
@@ -287,21 +275,22 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
   state->h3 = (state->h3 & nb) | (g3 & b);
   state->h4 = (state->h4 & nb) | (g4 & b);
 
-  f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]);
-  f1 = ((state->h1 >> 6) | (state->h2 << 20)) +
-       (uint64_t)U8TO32_LE(&state->key[4]);
-  f2 = ((state->h2 >> 12) | (state->h3 << 14)) +
-       (uint64_t)U8TO32_LE(&state->key[8]);
-  f3 = ((state->h3 >> 18) | (state->h4 << 8)) +
-       (uint64_t)U8TO32_LE(&state->key[12]);
+  uint64_t f0 = ((state->h0) | (state->h1 << 26)) +
+                (uint64_t)CRYPTO_load_u32_le(&state->key[0]);
+  uint64_t f1 = ((state->h1 >> 6) | (state->h2 << 20)) +
+                (uint64_t)CRYPTO_load_u32_le(&state->key[4]);
+  uint64_t f2 = ((state->h2 >> 12) | (state->h3 << 14)) +
+                (uint64_t)CRYPTO_load_u32_le(&state->key[8]);
+  uint64_t f3 = ((state->h3 >> 18) | (state->h4 << 8)) +
+                (uint64_t)CRYPTO_load_u32_le(&state->key[12]);
 
-  U32TO8_LE(&mac[0], (uint32_t)f0);
+  CRYPTO_store_u32_le(&mac[0], (uint32_t)f0);
   f1 += (f0 >> 32);
-  U32TO8_LE(&mac[4], (uint32_t)f1);
+  CRYPTO_store_u32_le(&mac[4], (uint32_t)f1);
   f2 += (f1 >> 32);
-  U32TO8_LE(&mac[8], (uint32_t)f2);
+  CRYPTO_store_u32_le(&mac[8], (uint32_t)f2);
   f3 += (f2 >> 32);
-  U32TO8_LE(&mac[12], (uint32_t)f3);
+  CRYPTO_store_u32_le(&mac[12], (uint32_t)f3);
 }
 
 #endif  // !BORINGSSL_HAS_UINT128 || !OPENSSL_X86_64
diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs
@@ -540,13 +540,11 @@ pub fn elem_exp_consttime<M>(
     let cpu_features = m.cpu_features();
 
     // The x86_64 assembly was written under the assumption that the input data
-    // is aligned to `MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH` bytes, which was/is
-    // 64 in OpenSSL. Similarly, OpenSSL uses the x86_64 assembly functions by
-    // giving it only inputs `tmp`, `am`, and `np` that immediately follow the
-    // table. The code seems to "work" even when the inputs aren't exactly
-    // like that but the side channel defenses might not be as effective. All
-    // the awkwardness here stems from trying to use the assembly code like
-    // OpenSSL does.
+    // is aligned to `MOD_EXP_CTIME_ALIGN` bytes, which was/is 64 in OpenSSL.
+    // Similarly, OpenSSL uses the x86_64 assembly functions by giving it only
+    // inputs `tmp`, `am`, and `np` that immediately follow the table. All the
+    // awkwardness here stems from trying to use the assembly code like OpenSSL
+    // does.
 
     use crate::limb::Window;