Mandelbrot and matmul improvements (modular#557)
jackos authored Sep 5, 2023
1 parent bdc57ad commit c30473a
Showing 3 changed files with 174 additions and 249 deletions.
examples/mandelbrot.mojo: 165 changes (19 additions, 146 deletions)
@@ -20,58 +20,6 @@ alias min_y = -1.5
 alias max_y = 1.5


-# Compute the number of steps to escape.
-def mandelbrot_kernel(c: ComplexFloat64) -> Int:
-    z = c
-    for i in range(MAX_ITERS):
-        z = z * z + c
-        if z.squared_norm() > 4:
-            return i
-    return MAX_ITERS
-
-
-def compute_mandelbrot() -> Tensor[float_type]:
-    # create a matrix. Each element of the matrix corresponds to a pixel
-    m = Tensor[float_type](width, height)
-
-    dx = (max_x - min_x) / width
-    dy = (max_y - min_y) / height
-
-    y = min_y
-    for j in range(height):
-        x = min_x
-        for i in range(width):
-            m[Index(i, j)] = mandelbrot_kernel(ComplexFloat64(x, y))
-            x += dx
-        y += dy
-    return m
-
-
-def show_plot(tensor: Tensor[float_type]):
-    np = Python.import_module("numpy")
-    plt = Python.import_module("matplotlib.pyplot")
-    colors = Python.import_module("matplotlib.colors")
-
-    numpy_array = np.zeros((height, width), np.float32)
-
-    for col in range(width):
-        for row in range(height):
-            numpy_array.itemset((row, col), tensor[col, row])
-
-    fig = plt.figure(1, [10, 10 * height // width], 64)
-    ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], False, 1)
-    light = colors.LightSource(315, 10, 0, 1, 1, 0)
-
-    image = light.shade(
-        numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5
-    )
-
-    plt.imshow(image)
-    plt.axis("off")
-    plt.savefig("out.png")
-    plt.show()
-

 fn mandelbrot_kernel_SIMD[
     simd_width: Int
 ](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
@@ -90,123 +38,48 @@ fn mandelbrot_kernel_SIMD[
     return iters


-fn vectorized():
-    let m = Tensor[float_type](width, height)
+fn main():
+    let t = Tensor[float_type](height, width)

     @parameter
-    fn worker(col: Int):
+    fn worker(row: Int):
         let scale_x = (max_x - min_x) / width
         let scale_y = (max_y - min_y) / height

         @parameter
-        fn compute_vector[simd_width: Int](row: Int):
+        fn compute_vector[simd_width: Int](col: Int):
             """Each time we oeprate on a `simd_width` vector of pixels."""
-            let cy = min_y + (row + iota[float_type, simd_width]()) * scale_y
-            let cx = min_x + col * scale_x
+            let cx = min_x + (col + iota[float_type, simd_width]()) * scale_x
+            let cy = min_y + row * scale_y
             let c = ComplexSIMD[float_type, simd_width](cx, cy)
-            m._ptr.simd_store[simd_width](
-                col * height + row, mandelbrot_kernel_SIMD[simd_width](c)
+            t.data().simd_store[simd_width](
+                row * width + col, mandelbrot_kernel_SIMD[simd_width](c)
             )

         # Vectorize the call to compute_vector where call gets a chunk of pixels.
         vectorize[simd_width, compute_vector](width)

-    @parameter
-    fn bench[simd_width: Int]():
-        for col in range(width):
-            worker(col)
-
-    let vectorized = Benchmark().run[bench[simd_width]]() / 1e6
-    print("Vectorized", ":", vectorized, "ms")
-
-    try:
-        _ = show_plot(m)
-    except e:
-        print("failed to show plot:", e.value)
-
-
-fn parallelized():
-    let m = Tensor[float_type](width, height)
-
-    @parameter
-    fn worker(col: Int):
-        let scale_x = (max_x - min_x) / width
-        let scale_y = (max_y - min_y) / height
-
-        @parameter
-        fn compute_vector[simd_width: Int](row: Int):
-            """Each time we oeprate on a `simd_width` vector of pixels."""
-            let cy = min_y + (row + iota[float_type, simd_width]()) * scale_y
-            let cx = min_x + col * scale_x
-            let c = ComplexSIMD[float_type, simd_width](cx, cy)
-            m._ptr.simd_store[simd_width](
-                col * height + row, mandelbrot_kernel_SIMD[simd_width](c)
-            )
-
-        # Vectorize the call to compute_vector where call gets a chunk of pixels.
-        vectorize[simd_width, compute_vector](width)
-
-    with Runtime() as rt:
-
-        @parameter
-        fn bench_parallel[simd_width: Int]():
-            parallelize[worker](rt, width, 5 * num_cores())
-
-        alias simd_width = simdwidthof[DType.float64]()
-        let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
-        print("Parallelized:", parallelized, "ms")
-
-    try:
-        _ = show_plot(m)
-    except e:
-        print("failed to show plot:", e.value)
-
-
-fn compare():
-    let m = Tensor[float_type](width, height)
-
-    @parameter
-    fn worker(col: Int):
-        let scale_x = (max_x - min_x) / width
-        let scale_y = (max_y - min_y) / height
-
-        @parameter
-        fn compute_vector[simd_width: Int](row: Int):
-            """Each time we oeprate on a `simd_width` vector of pixels."""
-            let cy = min_y + (row + iota[float_type, simd_width]()) * scale_y
-            let cx = min_x + col * scale_x
-            let c = ComplexSIMD[float_type, simd_width](cx, cy)
-            m._ptr.simd_store[simd_width](
-                col * height + row, mandelbrot_kernel_SIMD[simd_width](c)
-            )
-
-        # Vectorize the call to compute_vector where call gets a chunk of pixels.
-        vectorize[simd_width, compute_vector](width)

     # Vectorized
     @parameter
     fn bench[simd_width: Int]():
-        for col in range(width):
-            worker(col)
+        for row in range(height):
+            worker(row)

-    let vectorized = Benchmark().run[bench[simd_width]]() / 1e6
+    let vectorized_ms = Benchmark().run[bench[simd_width]]() / 1e6
     print("Number of hardware cores:", num_cores())
-    print("Vectorized:", vectorized, "ms")
+    print("Vectorized:", vectorized_ms, "ms")

     # Parallelized
     with Runtime() as rt:

         @parameter
         fn bench_parallel[simd_width: Int]():
-            parallelize[worker](rt, width, 5 * num_cores())
+            parallelize[worker](rt, height, 5 * num_cores())

         alias simd_width = simdwidthof[DType.float64]()
-        let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
-        print("Parallelized:", parallelized, "ms")
-        print("Parallel speedup:", vectorized / parallelized)
+        let parallelized_ms = Benchmark().run[
+            bench_parallel[simd_width]
+        ]() / 1e6
+        print("Parallelized:", parallelized_ms, "ms")
+        print("Parallel speedup:", vectorized_ms / parallelized_ms)

-    _ = m
-
-
-fn main() raises:
-    compare()
+    _ = t # Make sure tensor isn't destroyed before benchmark is finished
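
After this change the single main() routine parallelizes over rows and vectorizes across columns, writing each pixel at the row-major offset row * width + col into a Tensor[float_type](height, width). A minimal sketch of that offset calculation follows; flat_index is a hypothetical helper name for illustration and is not part of the commit:

# Sketch only. `flat_index` illustrates the offset used by the new
# `t.data().simd_store[simd_width](row * width + col, ...)` call.
fn flat_index(row: Int, col: Int, width: Int) -> Int:
    # Row-major layout: pixels of one row sit next to each other, so a SIMD
    # store starting at (row, col) fills columns col .. col + simd_width - 1.
    return row * width + col

For example, with a hypothetical width of 960, pixel (row 2, col 5) maps to flat offset 2 * 960 + 5 = 1925.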
