add bench/hist

camel-cdr · Nov 3, 2024 · 978b0ea · 978b0ea
1 parent 7f1e460
commit 978b0ea
Show file tree

Hide file tree

Showing 5 changed files with 192 additions and 1 deletion.
diff --git a/bench/Makefile b/bench/Makefile
@@ -2,7 +2,7 @@
 
 include ../config.mk
 
-EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6
+EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6 hist
 
 all: ${EXECS}
 

diff --git a/bench/bench.h b/bench/bench.h
@@ -40,6 +40,7 @@ compare_ux(void const *a, void const *b)
 
 static URand randState = { 123, 456, 789 };
 static ux bench_urand(void) { return urand(&randState); }
+static float bench_urandf(void) { return urandf(&randState); }
 static void bench_memrand(void *ptr, size_t n) { return memrand(&randState, ptr, n); }
 
 typedef struct {

diff --git a/bench/hist.S b/bench/hist.S
@@ -0,0 +1,118 @@
+#if 0
+
+void
+hist_rvv_no_conflict(uint16_t hist[100], float *x, float *y, size_t n)
+{
+	for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+		vl = __riscv_vsetvl_e32m8(n);
+		vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
+		vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl);
+		vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl);
+		vfloat32m8_t v = __riscv_vfsqrt(vsq, vl);
+		vuint16m4_t vidx =  __riscv_vminu(__riscv_vfncvt_rtz_xu(v, vl), 100, vl);
+		vidx = __riscv_vadd(vidx, vidx, vl);
+		vuint16m4_t vcnt =__riscv_vluxei16(hist, vidx, vl);
+		vcnt = __riscv_vadd(vcnt, 1, vl);
+		__riscv_vsuxei16(hist, vidx, vcnt, vl);
+	}
+}
+
+void
+hist_rvv_slidedown(uint16_t hist[100], float *x, float *y, size_t n)
+{
+	for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+		vl = __riscv_vsetvl_e32m8(n);
+		vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
+		vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl);
+		vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl);
+		vfloat32m8_t v = __riscv_vfsqrt(vsq, vl);
+		vuint16m4_t vidx =  __riscv_vminu(__riscv_vfncvt_rtz_xu(v, vl), 100, vl);
+
+		for (size_t i = 0; i < vl; ++i) {
+			size_t idx = __riscv_vmv_x(__riscv_vslidedown(vidx, i, 1));
+			++hist[idx];
+		}
+	}
+}
+#endif
+
+#ifdef MX
+
+.global MX(LUT4_rvv_vloxei8_)
+MX(LUT4_rvv_vloxei8_):
+1:
+	vsetvli a3, a2, e8, MX(), ta, ma
+	vle8.v v8, (a1)
+	vand.vi v8, v8, 15
+	vloxei8.v v8, (a0), v8
+	vse8.v v8, (a1)
+	sub a2, a2, a3
+	add a1, a1, a3
+	bnez a2, 1b
+	ret
+
+/* assumes no conflicts, which causes the wrong result */
+.global MX(hist_rvv_no_conflict_)
+MX(hist_rvv_no_conflict_):
+	li a4, 100
+1:
+	vsetvli a5, a3, e32, MX(), ta, ma
+	vle32.v v8, (a1)
+	vle32.v v16, (a2)
+	vfmul.vv v8, v8, v8
+	vfmacc.vv v8, v16, v16
+	vfsqrt.v v8, v8
+	vsetvli zero, zero, e16, MXf2(), ta, ma
+	vfncvt.rtz.xu.f.w v16, v8
+	vminu.vx v8, v16, a4
+	vadd.vv v8, v8, v8
+	vluxei16.v v12, (a0), v8
+	vadd.vi v12, v12, 1
+	vsuxei16.v v12, (a0), v8
+	sub a3, a3, a5
+	slli a5, a5, 2
+	add a1, a1, a5
+	add a2, a2, a5
+	bnez a3, 1b
+	ret
+
+.global MX(hist_rvv_slidedown_)
+MX(hist_rvv_slidedown_):
+	li a6, 100
+	j 2f
+1:
+	sub a3, a3, a7
+	slli a5, a7, 2
+	add a1, a1, a5
+	add a2, a2, a5
+	beqz a3, 4f
+2:
+	vsetvli a7, a3, e32, MX(), ta, ma
+	beqz a7, 1b
+	vle32.v v8, (a1)
+	vle32.v v16, (a2)
+	li a4, 0
+	vfmul.vv v8, v8, v8
+	vfmacc.vv v8, v16, v16
+	vfsqrt.v v8, v8
+	vsetvli zero, zero, e16, MXf2(), ta, ma
+	vfncvt.rtz.xu.f.w v16, v8
+	vminu.vx v8, v16, a6
+	vadd.vv v8, v8, v8
+	vsetivli zero, 1, e16, MXf2(), ta, ma
+3:
+	vslidedown.vx v12, v8, a4
+	vmv.x.s a5, v12
+	add t0, a0, a5
+	lh a5, 0(t0)
+	addi a5, a5, 1
+	addi a4, a4, 1
+	sh a5, 0(t0)
+	bne a7, a4, 3b
+	j 1b
+4:
+	ret
+
+#endif
+
+
diff --git a/bench/hist.c b/bench/hist.c
@@ -0,0 +1,66 @@
+#include "bench.h"
+
+#if __STDC_HOSTED__
+#include <math.h>
+#endif
+
+void
+hist_scalar(uint16_t hist[100], float *x, float *y, size_t n)
+{
+	for (size_t i = 0; i < n; ++i) {
+		float dist = x[i]*x[i] + y[i]*y[i];
+#if __STDC_HOSTED__
+		dist = sqrtf(dist);
+#else
+		__asm volatile("fsqrt.s %0, %0\n" : "+f"(dist));
+#endif
+		size_t idx = dist;
+		idx = idx > 100 ? 100 : dist;
+		++hist[idx];
+
+	}
+}
+
+#define IMPLS(f) \
+	f(scalar) \
+	MX(f, rvv_no_conflict) \
+	MX(f, rvv_slidedown) \
+
+typedef void Func(uint16_t hist[100], float *x, float *y, size_t n);
+
+#define DECLARE(f) extern Func hist_##f;
+IMPLS(DECLARE)
+
+#define EXTRACT(f) { #f, &hist_##f },
+Impl impls[] = { IMPLS(EXTRACT) };
+
+static uint16_t hist[100];
+float *inx, *iny;
+
+void init(void) {
+	inx = (float*)mem;
+	iny = (float*)(mem + MAX_MEM/2);
+}
+
+ux checksum(size_t n) {
+	ux sum = 0;
+	for (size_t i = 0; i < 100; ++i)
+		sum = hist[i];
+	return sum <= n; // sanity check for no_conflict
+}
+
+BENCH_BEG(base) {
+	n /= sizeof(float);
+	memset(hist, 0, sizeof hist);
+	float max = 70.71; // approx. sqrtf(100*100/2);
+	for (size_t i = 0; i < n; ++i) {
+		inx[i] = bench_urandf() * 2 * max - max;
+		iny[i] = bench_urandf() * 2 * max - max;
+	}
+	TIME f(hist, inx, iny, n);
+} BENCH_END
+
+Bench benches[] = {
+	BENCH( impls, MAX_MEM/2, "hist", bench_base)
+}; BENCH_MAIN(benches)
+
diff --git a/nolibc.h b/nolibc.h
@@ -281,6 +281,12 @@ urand(URand *r)
 	return xp;
 }
 
+static inline float
+urandf(URand *r) {
+	uint32_t x = urand(r);
+	return (x >> (32-24)) * (1.0f / (((uint32_t)1) << 24));
+}
+
 static void
 memrand(URand *r, void *ptr, size_t n)
 {