diff --git a/README.md b/README.md index 7e852f3..dd1d0c5 100644 --- a/README.md +++ b/README.md @@ -543,6 +543,74 @@ Benchmark 1: ./benchmarks/multibrot_set/multibrot\ Range (min … max): 134639.9 µs … 137621.4 µs 10 runs +### [Mojo Parallelized Mandelbrot Set](benchmarks/multibrot_set/multibrot_mojo_parallelize.mojo) + +```mojo +fn mandelbrot_kernel_SIMD[ + simd_width: Int +](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]: + """A vectorized implementation of the inner mandelbrot computation.""" + let cx = c.re + let cy = c.im + var x = SIMD[float_type, simd_width](0) + var y = SIMD[float_type, simd_width](0) + var y2 = SIMD[float_type, simd_width](0) + var iters = SIMD[float_type, simd_width](0) + + var t: SIMD[DType.bool, simd_width] = True + for i in range(MAX_ITERS): + if not t.reduce_or(): + break + y2 = y * y + y = x.fma(y + y, cy) + t = x.fma(x, y2) <= 4 + x = x.fma(x, cx - y2) + iters = t.select(iters + 1, iters) + return iters + + +fn compute_multibrot_parallelized() -> Tensor[float_type]: + let t = Tensor[float_type](height, width) + + @parameter + fn worker(row: Int): + let scale_x = (max_x - min_x) / width + let scale_y = (max_y - min_y) / height + + @parameter + fn compute_vector[simd_width: Int](col: Int): + """Each time we operate on a `simd_width` vector of pixels.""" + let cx = min_x + (col + iota[float_type, simd_width]()) * scale_x + let cy = min_y + row * scale_y + let c = ComplexSIMD[float_type, simd_width](cx, cy) + t.data().simd_store[simd_width]( + row * width + col, mandelbrot_kernel_SIMD[simd_width](c) + ) + + # Vectorize the call to compute_vector where call gets a chunk of pixels. 
+ vectorize[simd_width, compute_vector](width) + + # Parallelized + parallelize[worker](height, height) + return t + + +def main(): + _ = compute_multibrot_parallelized() +``` + +```shell +mojo build benchmarks/multibrot_set/multibrot_mojo_parallelize.mojo + +hyperfine --warmup 10 -r 10 --time-unit=microsecond --export-json benchmarks/multibrot_set/multibrot_mojo_parallelize.exe.json './benchmarks/multibrot_set/multibrot_mojo_parallelize' +``` + +**RESULT**:\ +Benchmark 1: ./benchmarks/multibrot_set/multibrot_mojo_parallelize\ + Time (mean ± σ): 7139.4 µs ± 596.4 µs [User: 36535.2 µs, System: 6670.1 µs]\ + Range (min … max): 6222.6 µs … 8269.7 µs 10 runs + + ### [Codon Mandelbrot Set](benchmarks/multibrot_set/multibrot.codon) ```codon def mandelbrot_kernel(c): @@ -623,14 +691,10 @@ Detailed one by one Places -1. Codon -2. Mojo -3. Python - -TODO: - -1) We are waiting for Mojo optimized version! -Like here [Mojo Mandelbrot](https://docs.modular.com/mojo/notebooks/Mandelbrot.html) +1. Mojo (parallelize) +2. Codon +3. Mojo +4. 
Python Links: diff --git a/benchmarks/multibrot_set/benchmarks.json b/benchmarks/multibrot_set/benchmarks.json index 54833af..49d5585 100644 --- a/benchmarks/multibrot_set/benchmarks.json +++ b/benchmarks/multibrot_set/benchmarks.json @@ -34,6 +34,40 @@ 0 ] }, + { + "command": "./benchmarks/multibrot_set/multibrot_mojo_parallelize", + "mean": 0.0071393926600000004, + "stddev": 0.000596373347825188, + "median": 0.00717595916, + "user": 0.036535200000000004, + "system": 0.006670119999999999, + "min": 0.0062225636599999995, + "max": 0.00826972966, + "times": [ + 0.00666552166, + 0.00710289666, + 0.00676443866, + 0.0062225636599999995, + 0.007547188660000001, + 0.007657730660000001, + 0.00724902166, + 0.0066388136599999995, + 0.00727602166, + 0.00826972966 + ], + "exit_codes": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, { "command": "./benchmarks/multibrot_set/multibrot_codon", "mean": 0.04418472508000001, diff --git a/benchmarks/multibrot_set/benchmarks.json.all.png b/benchmarks/multibrot_set/benchmarks.json.all.png index a72e9e4..4d982f1 100644 Binary files a/benchmarks/multibrot_set/benchmarks.json.all.png and b/benchmarks/multibrot_set/benchmarks.json.all.png differ diff --git a/benchmarks/multibrot_set/benchmarks.json.all2.png b/benchmarks/multibrot_set/benchmarks.json.all2.png index 9930b9a..7d9b68d 100644 Binary files a/benchmarks/multibrot_set/benchmarks.json.all2.png and b/benchmarks/multibrot_set/benchmarks.json.all2.png differ diff --git a/benchmarks/multibrot_set/benchmarks.json.combined.png b/benchmarks/multibrot_set/benchmarks.json.combined.png index ffc42e2..797a484 100644 Binary files a/benchmarks/multibrot_set/benchmarks.json.combined.png and b/benchmarks/multibrot_set/benchmarks.json.combined.png differ diff --git a/benchmarks/multibrot_set/benchmarks.json.md b/benchmarks/multibrot_set/benchmarks.json.md index 6b82c01..6a402ce 100644 --- a/benchmarks/multibrot_set/benchmarks.json.md +++ b/benchmarks/multibrot_set/benchmarks.json.md @@ 
-1,4 +1,16 @@ -1. Command './benchmarks/multibrot_set/multibrot_codon' +1 Command './benchmarks/multibrot_set/multibrot_mojo_parallelize' + runs: 10 + mean: 0.007 s + stddev: 0.001 s + median: 0.007 s + min: 0.006 s + max: 0.008 s + + percentiles: + P_05 .. P_95: 0.006 s .. 0.008 s + P_25 .. P_75: 0.007 s .. 0.007 s (IQR = 0.001 s) + +2 Command './benchmarks/multibrot_set/multibrot_codon' runs: 10 mean: 0.044 s stddev: 0.001 s @@ -10,7 +22,7 @@ P_05 .. P_95: 0.043 s .. 0.046 s P_25 .. P_75: 0.043 s .. 0.045 s (IQR = 0.001 s) -2. Command './benchmarks/multibrot_set/multibrot' (Mojo) +3 Command './benchmarks/multibrot_set/multibrot' runs: 10 mean: 0.136 s stddev: 0.001 s @@ -22,7 +34,7 @@ P_05 .. P_95: 0.135 s .. 0.138 s P_25 .. P_75: 0.135 s .. 0.137 s (IQR = 0.002 s) -3 Command 'python3 benchmarks/multibrot_set/__pycache__/multibrot.cpython-311.pyc' +4 Command 'python3 benchmarks/multibrot_set/__pycache__/multibrot.cpython-311.pyc' runs: 10 mean: 5.444 s stddev: 0.023 s diff --git a/benchmarks/multibrot_set/benchmarks.json.md.png b/benchmarks/multibrot_set/benchmarks.json.md.png index cb54f85..0b7818e 100644 Binary files a/benchmarks/multibrot_set/benchmarks.json.md.png and b/benchmarks/multibrot_set/benchmarks.json.md.png differ diff --git a/benchmarks/multibrot_set/multibrot_mojo_ parallelize.mojo.png b/benchmarks/multibrot_set/multibrot_mojo_ parallelize.mojo.png new file mode 100644 index 0000000..682e182 Binary files /dev/null and b/benchmarks/multibrot_set/multibrot_mojo_ parallelize.mojo.png differ diff --git a/benchmarks/multibrot_set/multibrot_mojo_parallelize b/benchmarks/multibrot_set/multibrot_mojo_parallelize new file mode 100755 index 0000000..d3c3802 Binary files /dev/null and b/benchmarks/multibrot_set/multibrot_mojo_parallelize differ diff --git a/benchmarks/multibrot_set/multibrot_mojo_parallelize.exe.json b/benchmarks/multibrot_set/multibrot_mojo_parallelize.exe.json new file mode 100644 index 0000000..23ad6fe --- /dev/null +++ 
b/benchmarks/multibrot_set/multibrot_mojo_parallelize.exe.json
@@ -0,0 +1,38 @@
+{
+  "results": [
+    {
+      "command": "./benchmarks/multibrot_set/multibrot_mojo_parallelize",
+      "mean": 0.0071393926600000004,
+      "stddev": 0.000596373347825188,
+      "median": 0.00717595916,
+      "user": 0.036535200000000004,
+      "system": 0.006670119999999999,
+      "min": 0.0062225636599999995,
+      "max": 0.00826972966,
+      "times": [
+        0.00666552166,
+        0.00710289666,
+        0.00676443866,
+        0.0062225636599999995,
+        0.007547188660000001,
+        0.007657730660000001,
+        0.00724902166,
+        0.0066388136599999995,
+        0.00727602166,
+        0.00826972966
+      ],
+      "exit_codes": [
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0
+      ]
+    }
+  ]
+}
diff --git a/benchmarks/multibrot_set/multibrot_mojo_parallelize.mojo b/benchmarks/multibrot_set/multibrot_mojo_parallelize.mojo
new file mode 100644
index 0000000..8127b76
--- /dev/null
+++ b/benchmarks/multibrot_set/multibrot_mojo_parallelize.mojo
@@ -0,0 +1,102 @@
+from complex import ComplexSIMD
+from math import iota
+from algorithm import parallelize, vectorize
+from tensor import Tensor
+from utils.index import Index
+
+# Element type used for the pixel math and for the output tensor.
+alias float_type = DType.float64
+# Vector width handed to `vectorize`: twice what `simdwidthof` reports for `float_type`.
+alias simd_width = 2 * simdwidthof[float_type]()
+
+# Image dimensions in pixels.
+alias width = 960
+alias height = 960
+# Upper bound on the number of iterations performed per pixel.
+alias MAX_ITERS = 200
+
+# Bounds of the sampled region: x feeds the real part, y the imaginary part.
+alias min_x = -2.0
+alias max_x = 0.6
+alias min_y = -1.5
+alias max_y = 1.5
+
+
+fn mandelbrot_kernel_SIMD[
+    simd_width: Int
+](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
+    """A vectorized implementation of the inner mandelbrot computation.
+
+    Every lane of `c` is iterated independently, starting from x = y = 0,
+    for at most `MAX_ITERS` steps.
+
+    Parameters:
+        simd_width: Number of lanes (pixels) processed by one call.
+
+    Args:
+        c: The complex points to evaluate, one per lane.
+
+    Returns:
+        Per lane, the number of loop iterations in which the escape test
+        passed, stored as `float_type`.
+    """
+    let cx = c.re
+    let cy = c.im
+    var x = SIMD[float_type, simd_width](0)
+    var y = SIMD[float_type, simd_width](0)
+    var y2 = SIMD[float_type, simd_width](0)
+    var iters = SIMD[float_type, simd_width](0)
+
+    # Per-lane mask holding the result of the most recent escape test.
+    var t: SIMD[DType.bool, simd_width] = True
+    for i in range(MAX_ITERS):
+        # Stop early once no lane passes the test any more.
+        if not t.reduce_or():
+            break
+        # NOTE(review): the comments below assume a.fma(b, c) evaluates a * b + c;
+        # confirm against the SIMD.fma documentation.
+        # The statement order matters: `t` and the new `x` must both be computed
+        # from the old `x` and from `y2`, the square of the old `y`.
+        y2 = y * y
+        # New imaginary part: 2 * x * y + cy.
+        y = x.fma(y + y, cy)
+        # Escape test on the previous point: x * x + y * y <= 4.
+        t = x.fma(x, y2) <= 4
+        # New real part: x * x - y * y + cx.
+        x = x.fma(x, cx - y2)
+        # Count this iteration only in the lanes where the test passed.
+        iters = t.select(iters + 1, iters)
+    return iters
+
+
+fn compute_multibrot_parallelized() -> Tensor[float_type]:
+    """Computes the whole image, one row per unit of parallel work.
+
+    Returns:
+        A `height` x `width` tensor whose element [row, col] is the value
+        returned by `mandelbrot_kernel_SIMD` for that pixel.
+    """
+    let t = Tensor[float_type](height, width)
+
+    @parameter
+    fn worker(row: Int):
+        # Step between neighbouring pixels along each axis of the sampled region.
+        let scale_x = (max_x - min_x) / width
+        let scale_y = (max_y - min_y) / height
+
+        @parameter
+        fn compute_vector[simd_width: Int](col: Int):
+            """Each time we operate on a `simd_width` vector of pixels."""
+            # Lane k of `cx` is the x coordinate of column `col + k`.
+            let cx = min_x + (col + iota[float_type, simd_width]()) * scale_x
+            let cy = min_y + row * scale_y
+            let c = ComplexSIMD[float_type, simd_width](cx, cy)
+            # Row-major layout: pixel (row, col) is stored at row * width + col.
+            t.data().simd_store[simd_width](
+                row * width + col, mandelbrot_kernel_SIMD[simd_width](c)
+            )
+
+        # Vectorize the call to compute_vector, where each call gets a chunk of pixels.
+        vectorize[simd_width, compute_vector](width)
+
+    # Parallelized: `worker` is invoked for every row of the image.
+    parallelize[worker](height, height)
+    return t
+
+
+def main():
+    """Benchmark entry point: computes the image and discards the result."""
+    _ = compute_multibrot_parallelized()
+
+    # let multibrot = compute_multibrot_parallelized()
+    # try:
+    #     _ = show_plot(multibrot)
+    # except e:
+    #     print("failed to show plot:", e)
+
+
+def show_plot(tensor: Tensor[float_type]):
+    """Renders `tensor` with matplotlib, saves it as a PNG and displays it.
+
+    Args:
+        tensor: The (height, width) image, indexed as [row, col].
+    """
+    alias scale = 10
+    alias dpi = 64
+
+    from python import Python
+    np = Python.import_module("numpy")
+    plt = Python.import_module("matplotlib.pyplot")
+    colors = Python.import_module("matplotlib.colors")
+
+    numpy_array = np.zeros((height, width), np.float64)
+
+    # Both the numpy array and the tensor have shape (height, width), so they
+    # are indexed as [row, col]. Indexing them as [col, row] is only in bounds
+    # while width == height.
+    for row in range(height):
+        for col in range(width):
+            numpy_array.itemset((row, col), tensor[row, col])
+
+    fig = plt.figure(1, [scale, scale * height // width], dpi)
+    ax = fig.add_axes((0.0, 0.0, 1.0, 1.0))
+    light = colors.LightSource(315, 10, 0, 1, 1, 0)
+
+    image = light.shade(numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5)
+    plt.imshow(image)
+    plt.axis("off")
+    # NOTE(review): the file name contains a space; it is kept because it matches
+    # the PNG committed next to this file.
+    plt.savefig("multibrot_mojo_ parallelize.mojo.png")
+    plt.show()