From 837b67df61197c9c6fc3759c4c56326a9fc12827 Mon Sep 17 00:00:00 2001 From: alion02 Date: Thu, 26 Dec 2024 04:33:50 +0100 Subject: [PATCH 1/3] wip --- src/day25.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/day25.rs b/src/day25.rs index 5421dd7..4dc0e4b 100644 --- a/src/day25.rs +++ b/src/day25.rs @@ -5,9 +5,11 @@ use super::*; unsafe fn inner1(s: &[u8]) -> u32 { static mut LOCKS: [u32; 250] = [0; 250]; static mut KEYS: [u32x8; 32] = [Simd::from_array([!0; 8]); 32]; + static mut BUCKETS: [[u32; 250]; 6] = [[0; 250]; 6]; let locks = LOCKS.as_mut_ptr(); let keys = KEYS.as_mut_ptr(); + let buf = &mut [0usize; 6]; asm!( "jmp 20f", @@ -28,6 +30,9 @@ unsafe fn inner1(s: &[u8]) -> u32 { "vpmovmskb {mask:e}, {chunk}", "test {mask:l}, 1", "jnz 21b", + "andn {height:e}, {col_mask:e}, {mask:e}", + "popcnt {height:e}, {height:e}", + "mov {tmp:e}, {height:e}", "mov [{keys}], {mask:e}", "add {keys}, 4", "vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i} - 43]", @@ -42,6 +47,8 @@ unsafe fn inner1(s: &[u8]) -> u32 { "30:", locks = inout(reg) locks => _, keys = inout(reg) keys => _, + // buckets = in(reg) &mut BUCKETS, + // buf = inout(reg) &mut BUCKETS => _, mask = out(reg) _, i = inout(reg) 43usize * 499 + 3 => _, ptr = in(reg) s.as_ptr(), From fc3d0c94d9786e58343df6e414d82664fd8916c6 Mon Sep 17 00:00:00 2001 From: alion02 Date: Thu, 26 Dec 2024 06:57:30 +0100 Subject: [PATCH 2/3] wip2 --- src/day25.rs | 67 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/src/day25.rs b/src/day25.rs index 4dc0e4b..1242b84 100644 --- a/src/day25.rs +++ b/src/day25.rs @@ -9,46 +9,39 @@ unsafe fn inner1(s: &[u8]) -> u32 { let locks = LOCKS.as_mut_ptr(); let keys = KEYS.as_mut_ptr(); - let buf = &mut [0usize; 6]; + let buckets = &mut BUCKETS; + let indices = &mut [0usize; 6]; asm!( "jmp 20f", "21:", "mov [{locks}], {mask:e}", "add {locks}, 4", - "vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i} - 43]", - "vpmovmskb {mask:e}, {chunk}", - "test {mask:l}, 1", - "jz 23f", - "22:", - "mov [{locks}], {mask:e}", - "add {locks}, 4", - "add {i:e}, -86", + "add {i:e}, -43", "jl 30f", "20:", "vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i}]", "vpmovmskb {mask:e}, {chunk}", "test {mask:l}, 1", "jnz 21b", - "andn {height:e}, {col_mask:e}, {mask:e}", + "andn {height:e}, {mask:e}, {col_mask:e}", "popcnt {height:e}, {height:e}", - "mov {tmp:e}, {height:e}", - "mov [{keys}], {mask:e}", - "add {keys}, 4", - "vpcmpeqb {chunk}, {vec_ascii_hash}, [{ptr} + {i} - 43]", - "vpmovmskb {mask:e}, {chunk}", - "test {mask:l}, 1", - "jnz 22b", - "23:", - "mov [{keys}], {mask:e}", - "add {keys}, 4", - "add {i:e}, -86", + "imul {bucket:e}, {height:e}, 1000", + "lea {bucket}, [{buckets} + {bucket}]", + "mov {idx}, [{indices} + {height} * 8]", + "mov [{bucket} + {idx}], {mask:e}", + "add {idx}, 4", + "mov [{indices} + {height} * 8], {idx}", + "add {i:e}, -43", "jge 20b", "30:", locks = inout(reg) locks => _, - keys = inout(reg) keys => _, - // buckets = in(reg) &mut BUCKETS, - // buf = inout(reg) &mut BUCKETS => _, + buckets = in(reg) buckets, + indices = in(reg) indices, + height = out(reg) _, + bucket = out(reg) _, + idx = out(reg) _, + col_mask = in(reg) 0b1000001000001000001000001000, mask = out(reg) _, i = inout(reg) 43usize * 499 + 3 => _, ptr = in(reg) s.as_ptr(), @@ -57,6 +50,32 @@ unsafe fn inner1(s: &[u8]) -> u32 { options(nostack), ); + asm!( + "23:", + "mov {idx}, [{index}]", + "add {idx}, -4", + "jl 22f", + "20:", + "mov {tmp:e}, [{bucket} + {idx}]", + "mov [{keys}], {tmp:e}", + "add {keys}, 4", + "21:", + "add {idx}, -4", + "jge 20b", + "22:", + "add {bucket}, -1000", + "add {index}, -8", + "cmp {bucket}, {buckets}", + "jge 23b", + keys = inout(reg) keys => _, + index = inout(reg) indices.as_mut_ptr().add(5) => _, + buckets = in(reg) buckets.as_mut_ptr(), + bucket = inout(reg) buckets.as_mut_ptr().add(5) => _, + tmp = out(reg) _, + idx = out(reg) _, + options(nostack), + ); + let mut sums = i32x8::splat(0); asm!( From b5269ddcde453e13a2586623cc916cc5e1b32d62 Mon Sep 17 00:00:00 2001 From: alion02 Date: Thu, 26 Dec 2024 08:53:49 +0100 Subject: [PATCH 3/3] cook up a jump table --- src/day25.rs | 205 +++++++++++++++++++++++++++------------------------ 1 file changed, 108 insertions(+), 97 deletions(-) diff --git a/src/day25.rs b/src/day25.rs index 1242b84..258753d 100644 --- a/src/day25.rs +++ b/src/day25.rs @@ -91,109 +91,120 @@ unsafe fn inner1(s: &[u8]) -> u32 { "vmovdqa {cache10}, [rip + {keys}+288]", "vmovdqa {cache11}, [rip + {keys}+320]", "vmovdqa {cache12}, [rip + {keys}+352]", + "lea {reladdrs}, [rip + 200f]", + "mov {r2:e}, [{reladdrs}]", + "add {r2}, {reladdrs}", + "mov {r1}, [{table}]", + "mov [{table}], {r2}", + "mov {r2}, {r1}", + "shr {r2}, 5", + "mov {r2:e}, [{reladdrs} + {r2} * 4]", + "add {r2}, {reladdrs}", + "add {r1}, [{table} + 8]", + "mov [{table} + 8], {r2}", + "mov {r2}, {r1}", + "shr {r2}, 5", + "mov {r2:e}, [{reladdrs} + {r2} * 4]", + "add {r2}, {reladdrs}", + "add {r1}, [{table} + 16]", + "mov [{table} + 16], {r2}", + "mov {r2}, {r1}", + "shr {r2}, 5", + "mov {r2:e}, [{reladdrs} + {r2} * 4]", + "add {r2}, {reladdrs}", + "add {r1}, [{table} + 24]", + "mov [{table} + 24], {r2}", + "mov {r2}, {r1}", + "shr {r2}, 5", + "mov {r2:e}, [{reladdrs} + {r2} * 4]", + "add {r2}, {reladdrs}", + "add {r1}, [{table} + 32]", + "mov [{table} + 32], {r2}", + "shr {r1}, 5", + "mov {r1:e}, [{reladdrs} + {r1} * 4]", + "add {r1}, {reladdrs}", + "mov [{table} + 40], {r1}", "20:", "vpbroadcastd {lock}, [{locks} + {i}]", - "vpand {tmp}, {lock}, {cache1}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache2}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache3}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache4}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache5}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache6}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache7}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache8}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache9}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache10}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache11}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, {cache12}", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+384]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+416]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+448]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+480]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+512]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+544]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+576]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+608]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+640]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+672]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+704]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+736]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+768]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+800]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+832]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+864]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+896]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+928]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+960]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", - "vpand {tmp}, {lock}, [rip + {keys}+992]", - "vpcmpeqd {tmp}, {tmp}, {vzero}", - "vpsubd {sums}, {sums}, {tmp}", + "andn {height:e}, {col_mask:e}, [{locks} + {i}]", + "popcnt {height:e}, {height:e}", + "jmp [{table} + {height} * 8]", + "200:", + ".long 201f-200b", + ".long 202f-200b", + ".long 203f-200b", + ".long 204f-200b", + ".long 205f-200b", + ".long 206f-200b", + ".long 207f-200b", + ".long 208f-200b", + ".long 209f-200b", + ".long 210f-200b", + ".long 211f-200b", + ".long 212f-200b", + ".long 213f-200b", + ".long 214f-200b", + ".long 215f-200b", + ".long 216f-200b", + ".long 217f-200b", + ".long 218f-200b", + ".long 219f-200b", + ".long 220f-200b", + ".long 221f-200b", + ".long 222f-200b", + ".long 223f-200b", + ".long 224f-200b", + ".long 225f-200b", + ".long 226f-200b", + ".long 227f-200b", + ".long 228f-200b", + ".long 229f-200b", + ".long 230f-200b", + ".long 231f-200b", + ".long 232f-200b", + "201:", "vpand {tmp}, {lock}, [rip + {keys}+992]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "202:", "vpand {tmp}, {lock}, [rip + {keys}+960]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "203:", "vpand {tmp}, {lock}, [rip + {keys}+928]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "204:", "vpand {tmp}, {lock}, [rip + {keys}+896]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "205:", "vpand {tmp}, {lock}, [rip + {keys}+864]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "206:", "vpand {tmp}, {lock}, [rip + {keys}+832]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "207:", "vpand {tmp}, {lock}, [rip + {keys}+800]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "208:", "vpand {tmp}, {lock}, [rip + {keys}+768]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "209:", "vpand {tmp}, {lock}, [rip + {keys}+736]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "210:", "vpand {tmp}, {lock}, [rip + {keys}+704]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "211:", "vpand {tmp}, {lock}, [rip + {keys}+672]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "212:", "vpand {tmp}, {lock}, [rip + {keys}+640]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "213:", "vpand {tmp}, {lock}, [rip + {keys}+608]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "214:", "vpand {tmp}, {lock}, [rip + {keys}+576]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "215:", "vpand {tmp}, {lock}, [rip + {keys}+544]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "216:", "vpand {tmp}, {lock}, [rip + {keys}+512]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "217:", "vpand {tmp}, {lock}, [rip + {keys}+480]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "218:", "vpand {tmp}, {lock}, [rip + {keys}+448]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "219:", "vpand {tmp}, {lock}, [rip + {keys}+416]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "220:", "vpand {tmp}, {lock}, [rip + {keys}+384]", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "221:", "vpand {tmp}, {lock}, {cache12}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "222:", "vpand {tmp}, {lock}, {cache11}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "223:", "vpand {tmp}, {lock}, {cache10}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "224:", "vpand {tmp}, {lock}, {cache9}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "225:", "vpand {tmp}, {lock}, {cache8}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "226:", "vpand {tmp}, {lock}, {cache7}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "227:", "vpand {tmp}, {lock}, {cache6}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "228:", "vpand {tmp}, {lock}, {cache5}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "229:", "vpand {tmp}, {lock}, {cache4}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "230:", "vpand {tmp}, {lock}, {cache3}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "231:", "vpand {tmp}, {lock}, {cache2}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", + "232:", "vpand {tmp}, {lock}, {cache1}", "vpcmpeqd {tmp}, {tmp}, {vzero}", "vpsubd {sums}, {sums}, {tmp}", "add {i:e}, -4", "jge 20b", keys = sym KEYS, locks = in(reg) locks, i = inout(reg) 996usize => _, + height = out(reg) _, + col_mask = in(reg) !0b1000001000001000001000001000u32, + reladdrs = out(reg) _, + r1 = out(reg) _, + r2 = out(reg) _, + table = in(reg) indices, vzero = in(ymm_reg) u32x8::splat(0), tmp = out(ymm_reg) _, lock = out(ymm_reg) _, @@ -210,7 +221,7 @@ unsafe fn inner1(s: &[u8]) -> u32 { cache10 = out(ymm_reg) _, cache11 = out(ymm_reg) _, cache12 = out(ymm_reg) _, - options(nostack, readonly), + options(nostack), ); sums.reduce_sum() as u32