Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 153 additions & 116 deletions src/day25.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,43 @@ use super::*;
unsafe fn inner1(s: &[u8]) -> u32 {
static mut LOCKS: [u32; 250] = [0; 250];
static mut KEYS: [u32x8; 32] = [Simd::from_array([!0; 8]); 32];
static mut BUCKETS: [[u32; 250]; 6] = [[0; 250]; 6];

let locks = LOCKS.as_mut_ptr();
let keys = KEYS.as_mut_ptr();
let buckets = &mut BUCKETS;
let indices = &mut [0usize; 6];

asm!(
"jmp 20f",
"21:",
"mov [{locks}], {mask:e}",
"add {locks}, 4",
"vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i} - 43]",
"vpmovmskb {mask:e}, {chunk}",
"test {mask:l}, 1",
"jz 23f",
"22:",
"mov [{locks}], {mask:e}",
"add {locks}, 4",
"add {i:e}, -86",
"add {i:e}, -43",
"jl 30f",
"20:",
"vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i}]",
"vpmovmskb {mask:e}, {chunk}",
"test {mask:l}, 1",
"jnz 21b",
"mov [{keys}], {mask:e}",
"add {keys}, 4",
"vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i} - 43]",
"vpmovmskb {mask:e}, {chunk}",
"test {mask:l}, 1",
"jnz 22b",
"23:",
"mov [{keys}], {mask:e}",
"add {keys}, 4",
"add {i:e}, -86",
"andn {height:e}, {mask:e}, {col_mask:e}",
"popcnt {height:e}, {height:e}",
"imul {bucket:e}, {height:e}, 1000",
"lea {bucket}, [{buckets} + {bucket}]",
"mov {idx}, [{indices} + {height} * 8]",
"mov [{bucket} + {idx}], {mask:e}",
"add {idx}, 4",
"mov [{indices} + {height} * 8], {idx}",
"add {i:e}, -43",
"jge 20b",
"30:",
locks = inout(reg) locks => _,
keys = inout(reg) keys => _,
buckets = in(reg) buckets,
indices = in(reg) indices,
height = out(reg) _,
bucket = out(reg) _,
idx = out(reg) _,
col_mask = in(reg) 0b1000001000001000001000001000,
mask = out(reg) _,
i = inout(reg) 43usize * 499 + 3 => _,
ptr = in(reg) s.as_ptr(),
Expand All @@ -50,6 +50,32 @@ unsafe fn inner1(s: &[u8]) -> u32 {
options(nostack),
);

asm!(
"23:",
"mov {idx}, [{index}]",
"add {idx}, -4",
"jl 22f",
"20:",
"mov {tmp:e}, [{bucket} + {idx}]",
"mov [{keys}], {tmp:e}",
"add {keys}, 4",
"21:",
"add {idx}, -4",
"jge 20b",
"22:",
"add {bucket}, -1000",
"add {index}, -8",
"cmp {bucket}, {buckets}",
"jge 23b",
keys = inout(reg) keys => _,
index = inout(reg) indices.as_mut_ptr().add(5) => _,
buckets = in(reg) buckets.as_mut_ptr(),
bucket = inout(reg) buckets.as_mut_ptr().add(5) => _,
tmp = out(reg) _,
idx = out(reg) _,
options(nostack),
);

let mut sums = i32x8::splat(0);

asm!(
Expand All @@ -65,109 +91,120 @@ unsafe fn inner1(s: &[u8]) -> u32 {
"vmovdqa {cache10}, [rip + {keys}+288]",
"vmovdqa {cache11}, [rip + {keys}+320]",
"vmovdqa {cache12}, [rip + {keys}+352]",
"lea {reladdrs}, [rip + 200f]",
"mov {r2:e}, [{reladdrs}]",
"add {r2}, {reladdrs}",
"mov {r1}, [{table}]",
"mov [{table}], {r2}",
"mov {r2}, {r1}",
"shr {r2}, 5",
"mov {r2:e}, [{reladdrs} + {r2} * 4]",
"add {r2}, {reladdrs}",
"add {r1}, [{table} + 8]",
"mov [{table} + 8], {r2}",
"mov {r2}, {r1}",
"shr {r2}, 5",
"mov {r2:e}, [{reladdrs} + {r2} * 4]",
"add {r2}, {reladdrs}",
"add {r1}, [{table} + 16]",
"mov [{table} + 16], {r2}",
"mov {r2}, {r1}",
"shr {r2}, 5",
"mov {r2:e}, [{reladdrs} + {r2} * 4]",
"add {r2}, {reladdrs}",
"add {r1}, [{table} + 24]",
"mov [{table} + 24], {r2}",
"mov {r2}, {r1}",
"shr {r2}, 5",
"mov {r2:e}, [{reladdrs} + {r2} * 4]",
"add {r2}, {reladdrs}",
"add {r1}, [{table} + 32]",
"mov [{table} + 32], {r2}",
"shr {r1}, 5",
"mov {r1:e}, [{reladdrs} + {r1} * 4]",
"add {r1}, {reladdrs}",
"mov [{table} + 40], {r1}",
"20:",
"vpbroadcastd {lock}, [{locks} + {i}]",
"vpand {tmp}, {lock}, {cache1}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache2}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache3}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache4}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache5}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache6}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache7}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache8}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache9}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache10}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache11}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, {cache12}",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+384]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+416]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+448]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+480]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+512]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+544]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+576]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+608]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+640]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+672]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+704]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+736]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+768]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+800]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+832]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+864]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+896]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+928]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+960]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"vpand {tmp}, {lock}, [rip + {keys}+992]",
"vpcmpeqd {tmp}, {tmp}, {vzero}",
"vpsubd {sums}, {sums}, {tmp}",
"andn {height:e}, {col_mask:e}, [{locks} + {i}]",
"popcnt {height:e}, {height:e}",
"jmp [{table} + {height} * 8]",
"200:",
".long 201f-200b",
".long 202f-200b",
".long 203f-200b",
".long 204f-200b",
".long 205f-200b",
".long 206f-200b",
".long 207f-200b",
".long 208f-200b",
".long 209f-200b",
".long 210f-200b",
".long 211f-200b",
".long 212f-200b",
".long 213f-200b",
".long 214f-200b",
".long 215f-200b",
".long 216f-200b",
".long 217f-200b",
".long 218f-200b",
".long 219f-200b",
".long 220f-200b",
".long 221f-200b",
".long 222f-200b",
".long 223f-200b",
".long 224f-200b",
".long 225f-200b",
".long 226f-200b",
".long 227f-200b",
".long 228f-200b",
".long 229f-200b",
".long 230f-200b",
".long 231f-200b",
".long 232f-200b",
"201:", "vpand {tmp}, {lock}, [rip + {keys}+992]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"202:", "vpand {tmp}, {lock}, [rip + {keys}+960]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"203:", "vpand {tmp}, {lock}, [rip + {keys}+928]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"204:", "vpand {tmp}, {lock}, [rip + {keys}+896]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"205:", "vpand {tmp}, {lock}, [rip + {keys}+864]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"206:", "vpand {tmp}, {lock}, [rip + {keys}+832]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"207:", "vpand {tmp}, {lock}, [rip + {keys}+800]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"208:", "vpand {tmp}, {lock}, [rip + {keys}+768]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"209:", "vpand {tmp}, {lock}, [rip + {keys}+736]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"210:", "vpand {tmp}, {lock}, [rip + {keys}+704]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"211:", "vpand {tmp}, {lock}, [rip + {keys}+672]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"212:", "vpand {tmp}, {lock}, [rip + {keys}+640]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"213:", "vpand {tmp}, {lock}, [rip + {keys}+608]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"214:", "vpand {tmp}, {lock}, [rip + {keys}+576]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"215:", "vpand {tmp}, {lock}, [rip + {keys}+544]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"216:", "vpand {tmp}, {lock}, [rip + {keys}+512]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"217:", "vpand {tmp}, {lock}, [rip + {keys}+480]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"218:", "vpand {tmp}, {lock}, [rip + {keys}+448]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"219:", "vpand {tmp}, {lock}, [rip + {keys}+416]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"220:", "vpand {tmp}, {lock}, [rip + {keys}+384]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"221:", "vpand {tmp}, {lock}, {cache12}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"222:", "vpand {tmp}, {lock}, {cache11}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"223:", "vpand {tmp}, {lock}, {cache10}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"224:", "vpand {tmp}, {lock}, {cache9}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"225:", "vpand {tmp}, {lock}, {cache8}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"226:", "vpand {tmp}, {lock}, {cache7}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"227:", "vpand {tmp}, {lock}, {cache6}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"228:", "vpand {tmp}, {lock}, {cache5}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"229:", "vpand {tmp}, {lock}, {cache4}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"230:", "vpand {tmp}, {lock}, {cache3}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"231:", "vpand {tmp}, {lock}, {cache2}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"232:", "vpand {tmp}, {lock}, {cache1}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}",
"add {i:e}, -4",
"jge 20b",
keys = sym KEYS,
locks = in(reg) locks,
i = inout(reg) 996usize => _,
height = out(reg) _,
col_mask = in(reg) !0b1000001000001000001000001000u32,
reladdrs = out(reg) _,
r1 = out(reg) _,
r2 = out(reg) _,
table = in(reg) indices,
vzero = in(ymm_reg) u32x8::splat(0),
tmp = out(ymm_reg) _,
lock = out(ymm_reg) _,
Expand All @@ -184,7 +221,7 @@ unsafe fn inner1(s: &[u8]) -> u32 {
cache10 = out(ymm_reg) _,
cache11 = out(ymm_reg) _,
cache12 = out(ymm_reg) _,
options(nostack, readonly),
options(nostack),
);

sums.reduce_sum() as u32
Expand Down
Loading