Skip to content

Commit

Permalink
Merge pull request #1651 from briansmith/b/merge-boringssl-11
Browse files Browse the repository at this point in the history
Merge BoringSSL through 27e45c4
  • Loading branch information
briansmith authored Sep 26, 2023
2 parents ad59665 + 0ae93f0 commit b04bed1
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 59 deletions.
6 changes: 3 additions & 3 deletions crypto/fipsmodule/aes/aes_nohw.c
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,

static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
const AES_KEY *key) {
for (unsigned i = 0; i <= key->rounds; i++) {
for (size_t i = 0; i <= key->rounds; i++) {
// Copy the round key into each block in the batch.
for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
Expand Down Expand Up @@ -921,8 +921,8 @@ void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
uint32_t ctr = CRYPTO_load_u32_be(ivs + 12);
for (;;) {
// Update counters.
for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + i);
for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i);
}

size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
Expand Down
58 changes: 45 additions & 13 deletions crypto/fipsmodule/bn/asm/x86_64-mont5.pl
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,12 @@
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
# Calculate masks by comparing 0..31 to $idx and save result to stack.
#
# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
# are scheduled in groups of four.
$code.=<<___;
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0 # compare to 1,0
Expand Down Expand Up @@ -228,7 +232,8 @@
}
$code.=<<___;
por %xmm1,%xmm0
pshufd \$0x4e,%xmm0,%xmm1
# Combine the upper and lower halves of %xmm0.
pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves.
por %xmm1,%xmm0
lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
Expand Down Expand Up @@ -321,7 +326,8 @@
}
$code.=<<___;
por %xmm5,%xmm4
pshufd \$0x4e,%xmm4,%xmm0
# Combine the upper and lower halves of %xmm4 as %xmm0.
pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves.
por %xmm4,%xmm0
lea $STRIDE($bp),$bp

Expand Down Expand Up @@ -575,7 +581,6 @@
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$tp=$i;
$code.=<<___;
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
Expand All @@ -589,8 +594,12 @@
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
# Calculate masks by comparing 0..31 to $idx and save result to stack.
#
# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
# are scheduled in groups of four.
$code.=<<___;
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0 # compare to 1,0
Expand Down Expand Up @@ -659,7 +668,8 @@
}
$code.=<<___;
por %xmm1,%xmm0
pshufd \$0x4e,%xmm0,%xmm1
# Combine the upper and lower halves of %xmm0.
pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves.
por %xmm1,%xmm0
lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
Expand Down Expand Up @@ -836,7 +846,8 @@
}
$code.=<<___;
por %xmm5,%xmm4
pshufd \$0x4e,%xmm4,%xmm0
# Combine the upper and lower halves of %xmm4 as %xmm0.
pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves.
por %xmm4,%xmm0
lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[i]
Expand Down Expand Up @@ -2227,7 +2238,6 @@
("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
my $STRIDE=2**5*8; # 5 is "window size"
my $N=$STRIDE/4; # should match cache line size
$code.=<<___;
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
Expand All @@ -2240,8 +2250,12 @@
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
# Calculate masks by comparing 0..31 to $idx and save result to stack.
#
# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
# are scheduled in groups of four.
$code.=<<___;
.byte 0x67
paddd %xmm0,%xmm1
Expand Down Expand Up @@ -2310,7 +2324,8 @@
}
$code.=<<___;
pxor %xmm1,%xmm0
pshufd \$0x4e,%xmm0,%xmm1
# Combine the upper and lower halves of %xmm0.
pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves.
por %xmm1,%xmm0
lea $STRIDE($bptr),$bptr
movq %xmm0,%rdx # bp[0]
Expand Down Expand Up @@ -2430,7 +2445,8 @@
}
$code.=<<___;
por %xmm5,%xmm4
pshufd \$0x4e,%xmm4,%xmm0
# Combine the upper and lower halves of %xmm4 as %xmm0.
pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves.
por %xmm4,%xmm0
lea $STRIDE($bptr),$bptr
movq %xmm0,%rdx # m0=bp[i]
Expand Down Expand Up @@ -3434,6 +3450,15 @@
.cfi_startproc
cmp \$0, $num
jz .Lscatter_epilogue

# $tbl stores 32 entries, t0 through t31. Each entry has $num words.
# They are interleaved in memory as follows:
#
# t0[0] t1[0] t2[0] ... t31[0]
# t0[1] t1[1] t2[1] ... t31[1]
# ...
# t0[$num-1] t1[$num-1] t2[$num-1] ... t31[$num-1]

lea ($tbl,$idx,8),$tbl
.Lscatter:
mov ($inp),%rax
Expand Down Expand Up @@ -3471,8 +3496,12 @@
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
# Calculate masks by comparing 0..31 to $idx and save result to stack.
#
# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
# are scheduled in groups of four.
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
paddd %xmm0,%xmm1
Expand Down Expand Up @@ -3510,6 +3539,8 @@
pxor %xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
# Combine the masks with the corresponding table entries to select the correct
# entry.
$code.=<<___;
movdqa `16*($i+0)-128`(%r11),%xmm0
movdqa `16*($i+1)-128`(%r11),%xmm1
Expand All @@ -3528,7 +3559,8 @@
$code.=<<___;
por %xmm5,%xmm4
lea $STRIDE(%r11),%r11
pshufd \$0x4e,%xmm4,%xmm0
# Combine the upper and lower halves of %xmm4 as %xmm0.
pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves.
por %xmm4,%xmm0
movq %xmm0,($out) # m0=bp[0]
lea 8($out),$out
Expand Down
2 changes: 1 addition & 1 deletion crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1261,7 +1261,7 @@

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
// int rep);
// uint64_t rep);
.globl ecp_nistz256_ord_sqr_mont
.type ecp_nistz256_ord_sqr_mont,%function
.align 4
Expand Down
59 changes: 24 additions & 35 deletions crypto/poly1305/poly1305.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,6 @@
#pragma GCC diagnostic ignored "-Wconversion"
#endif

// We can assume little-endian.
static uint32_t U8TO32_LE(const uint8_t *m) {
uint32_t r;
OPENSSL_memcpy(&r, m, sizeof(r));
return r;
}

static void U32TO8_LE(uint8_t *m, uint32_t v) {
OPENSSL_memcpy(m, &v, sizeof(v));
}

static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; }

struct poly1305_state_st {
Expand Down Expand Up @@ -78,10 +67,10 @@ static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
}

poly1305_donna_16bytes:
t0 = U8TO32_LE(in);
t1 = U8TO32_LE(in + 4);
t2 = U8TO32_LE(in + 8);
t3 = U8TO32_LE(in + 12);
t0 = CRYPTO_load_u32_le(in);
t1 = CRYPTO_load_u32_le(in + 4);
t2 = CRYPTO_load_u32_le(in + 8);
t3 = CRYPTO_load_u32_le(in + 12);

in += 16;
len -= 16;
Expand Down Expand Up @@ -144,10 +133,10 @@ static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
}
len = 0;

t0 = U8TO32_LE(mp + 0);
t1 = U8TO32_LE(mp + 4);
t2 = U8TO32_LE(mp + 8);
t3 = U8TO32_LE(mp + 12);
t0 = CRYPTO_load_u32_le(mp + 0);
t1 = CRYPTO_load_u32_le(mp + 4);
t2 = CRYPTO_load_u32_le(mp + 8);
t3 = CRYPTO_load_u32_le(mp + 12);

state->h0 += t0 & 0x3ffffff;
state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
Expand All @@ -162,10 +151,10 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) {
struct poly1305_state_st *state = poly1305_aligned_state(statep);
uint32_t t0, t1, t2, t3;

t0 = U8TO32_LE(key + 0);
t1 = U8TO32_LE(key + 4);
t2 = U8TO32_LE(key + 8);
t3 = U8TO32_LE(key + 12);
t0 = CRYPTO_load_u32_le(key + 0);
t1 = CRYPTO_load_u32_le(key + 4);
t2 = CRYPTO_load_u32_le(key + 8);
t3 = CRYPTO_load_u32_le(key + 12);

// precompute multipliers
state->r0 = t0 & 0x3ffffff;
Expand Down Expand Up @@ -241,7 +230,6 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in,

void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
struct poly1305_state_st *state = poly1305_aligned_state(statep);
uint64_t f0, f1, f2, f3;
uint32_t g0, g1, g2, g3, g4;
uint32_t b, nb;

Expand Down Expand Up @@ -287,21 +275,22 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
state->h3 = (state->h3 & nb) | (g3 & b);
state->h4 = (state->h4 & nb) | (g4 & b);

f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]);
f1 = ((state->h1 >> 6) | (state->h2 << 20)) +
(uint64_t)U8TO32_LE(&state->key[4]);
f2 = ((state->h2 >> 12) | (state->h3 << 14)) +
(uint64_t)U8TO32_LE(&state->key[8]);
f3 = ((state->h3 >> 18) | (state->h4 << 8)) +
(uint64_t)U8TO32_LE(&state->key[12]);
uint64_t f0 = ((state->h0) | (state->h1 << 26)) +
(uint64_t)CRYPTO_load_u32_le(&state->key[0]);
uint64_t f1 = ((state->h1 >> 6) | (state->h2 << 20)) +
(uint64_t)CRYPTO_load_u32_le(&state->key[4]);
uint64_t f2 = ((state->h2 >> 12) | (state->h3 << 14)) +
(uint64_t)CRYPTO_load_u32_le(&state->key[8]);
uint64_t f3 = ((state->h3 >> 18) | (state->h4 << 8)) +
(uint64_t)CRYPTO_load_u32_le(&state->key[12]);

U32TO8_LE(&mac[0], (uint32_t)f0);
CRYPTO_store_u32_le(&mac[0], (uint32_t)f0);
f1 += (f0 >> 32);
U32TO8_LE(&mac[4], (uint32_t)f1);
CRYPTO_store_u32_le(&mac[4], (uint32_t)f1);
f2 += (f1 >> 32);
U32TO8_LE(&mac[8], (uint32_t)f2);
CRYPTO_store_u32_le(&mac[8], (uint32_t)f2);
f3 += (f2 >> 32);
U32TO8_LE(&mac[12], (uint32_t)f3);
CRYPTO_store_u32_le(&mac[12], (uint32_t)f3);
}

#endif // !BORINGSSL_HAS_UINT128 || !OPENSSL_X86_64
12 changes: 5 additions & 7 deletions src/arithmetic/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -540,13 +540,11 @@ pub fn elem_exp_consttime<M>(
let cpu_features = m.cpu_features();

// The x86_64 assembly was written under the assumption that the input data
// is aligned to `MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH` bytes, which was/is
// 64 in OpenSSL. Similarly, OpenSSL uses the x86_64 assembly functions by
// giving it only inputs `tmp`, `am`, and `np` that immediately follow the
// table. The code seems to "work" even when the inputs aren't exactly
// like that but the side channel defenses might not be as effective. All
// the awkwardness here stems from trying to use the assembly code like
// OpenSSL does.
// is aligned to `MOD_EXP_CTIME_ALIGN` bytes, which was/is 64 in OpenSSL.
// Similarly, OpenSSL uses the x86_64 assembly functions by giving it only
// inputs `tmp`, `am`, and `np` that immediately follow the table. All the
// awkwardness here stems from trying to use the assembly code like OpenSSL
// does.

use crate::limb::Window;

Expand Down

0 comments on commit b04bed1

Please sign in to comment.