Improves matmul and mandelbrot parallel notebooks and examples (modul…
jackos authored Sep 1, 2023
1 parent 2d25cee commit 51d357f
Showing 4 changed files with 1,179 additions and 1,076 deletions.
210 changes: 210 additions & 0 deletions examples/mandelbrot.mojo
@@ -0,0 +1,210 @@
from benchmark import Benchmark
from complex import ComplexSIMD, ComplexFloat64
from math import iota
from python import Python
from runtime.llcl import num_cores, Runtime
from algorithm import parallelize, vectorize
from tensor import Tensor
from utils.index import Index

alias float_type = DType.float64
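# Number of float64 lanes in one SIMD register (e.g. 4 with AVX2, 8 with AVX-512).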
alias simd_width = simdwidthof[float_type]()

alias width = 960
alias height = 960
alias MAX_ITERS = 200

alias min_x = -2.0
alias max_x = 0.6
alias min_y = -1.5
alias max_y = 1.5


# Compute the number of steps to escape.
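# A point c is in the set if z = z*z + c stays bounded; once |z| > 2
# (squared_norm > 4) the sequence is guaranteed to diverge.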
def mandelbrot_kernel(c: ComplexFloat64) -> Int:
    z = c
    for i in range(MAX_ITERS):
        z = z * z + c
        if z.squared_norm() > 4:
            return i
    return MAX_ITERS


def compute_mandelbrot() -> Tensor[float_type]:
    # Create a matrix. Each element of the matrix corresponds to a pixel.
    m = Tensor[float_type](width, height)

    dx = (max_x - min_x) / width
    dy = (max_y - min_y) / height

    y = min_y
    for j in range(height):
        x = min_x
        for i in range(width):
            m[Index(i, j)] = mandelbrot_kernel(ComplexFloat64(x, y))
            x += dx
        y += dy
    return m


def show_plot(tensor: Tensor[float_type]):
    np = Python.import_module("numpy")
    plt = Python.import_module("matplotlib.pyplot")
    colors = Python.import_module("matplotlib.colors")

    numpy_array = np.zeros((height, width), np.float32)

    for col in range(width):
        for row in range(height):
            numpy_array.itemset((row, col), tensor[col, row])

    fig = plt.figure(1, [10, 10 * height // width], 64)
    ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], False, 1)
    light = colors.LightSource(315, 10, 0, 1, 1, 0)

    image = light.shade(
        numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5
    )

    plt.imshow(image)
    plt.axis("off")
    plt.savefig("out.png")
    plt.show()


fn mandelbrot_kernel_SIMD[
    simd_width: Int
](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
    """A vectorized implementation of the inner mandelbrot computation."""
    var z = ComplexSIMD[float_type, simd_width](0, 0)
    var iters = SIMD[float_type, simd_width](0)

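    # Unlike the scalar kernel, we can't return early per pixel: track the
    # lanes that have not yet escaped in a boolean mask and iterate until
    # every lane has escaped (or MAX_ITERS is reached).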
    var in_set_mask: SIMD[DType.bool, simd_width] = True
    for i in range(MAX_ITERS):
        if not in_set_mask.reduce_or():
            break
        in_set_mask = z.squared_norm() <= 4
        iters = in_set_mask.select(iters + 1, iters)
        z = z.squared_add(c)

    return iters


fn vectorized():
    let m = Tensor[float_type](width, height)

    @parameter
    fn worker(col: Int):
        let scale_x = (max_x - min_x) / width
        let scale_y = (max_y - min_y) / height

        @parameter
        fn compute_vector[simd_width: Int](row: Int):
            """Each time we operate on a `simd_width` vector of pixels."""
            let cy = min_y + (row + iota[float_type, simd_width]()) * scale_y
            let cx = min_x + col * scale_x
            let c = ComplexSIMD[float_type, simd_width](cx, cy)
            m.simd_store[simd_width](
                Index(col, row), mandelbrot_kernel_SIMD[simd_width](c)
            )

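        # With simd_width = 4, for example, each compute_vector call fills
        # four consecutive pixels of this column with one simd_store.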
        # Vectorize the call to compute_vector, where each call gets a chunk of pixels.
        vectorize[simd_width, compute_vector](width)

    @parameter
    fn bench[simd_width: Int]():
        for col in range(width):
            worker(col)

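    # Benchmark().run reports time in nanoseconds; divide by 1e6 for milliseconds.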
    let vectorized = Benchmark().run[bench[simd_width]]() / 1e6
    print("Vectorized", ":", vectorized, "ms")

    try:
        _ = show_plot(m)
    except e:
        print("failed to show plot:", e.value)


fn parallelized():
    let m = Tensor[float_type](width, height)

    @parameter
    fn worker(col: Int):
        let scale_x = (max_x - min_x) / width
        let scale_y = (max_y - min_y) / height

        @parameter
        fn compute_vector[simd_width: Int](row: Int):
            """Each time we operate on a `simd_width` vector of pixels."""
            let cy = min_y + (row + iota[float_type, simd_width]()) * scale_y
            let cx = min_x + col * scale_x
            let c = ComplexSIMD[float_type, simd_width](cx, cy)
            m.simd_store[simd_width](
                Index(col, row), mandelbrot_kernel_SIMD[simd_width](c)
            )

        # Vectorize the call to compute_vector, where each call gets a chunk of pixels.
        vectorize[simd_width, compute_vector](width)

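    # Columns vary widely in cost (points inside the set run all MAX_ITERS
    # iterations), so asking for 5 * num_cores() work items gives the
    # scheduler finer-grained chunks for load balancing.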
    with Runtime() as rt:

        @parameter
        fn bench_parallel[simd_width: Int]():
            parallelize[worker](rt, width, 5 * num_cores())

        alias simd_width = simdwidthof[DType.float64]()
        let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
        print("Parallelized:", parallelized, "ms")

    try:
        _ = show_plot(m)
    except e:
        print("failed to show plot:", e.value)


fn compare():
    let m = Tensor[float_type](width, height)

    @parameter
    fn worker(col: Int):
        let scale_x = (max_x - min_x) / width
        let scale_y = (max_y - min_y) / height

        @parameter
        fn compute_vector[simd_width: Int](row: Int):
            """Each time we operate on a `simd_width` vector of pixels."""
            let cy = min_y + (row + iota[float_type, simd_width]()) * scale_y
            let cx = min_x + col * scale_x
            let c = ComplexSIMD[float_type, simd_width](cx, cy)
            m.simd_store[simd_width](
                Index(col, row), mandelbrot_kernel_SIMD[simd_width](c)
            )

        # Vectorize the call to compute_vector, where each call gets a chunk of pixels.
        vectorize[simd_width, compute_vector](width)

    # Vectorized
    @parameter
    fn bench[simd_width: Int]():
        for col in range(width):
            worker(col)

    let vectorized = Benchmark().run[bench[simd_width]]() / 1e6
    print("Number of hardware cores:", num_cores())
    print("Vectorized:", vectorized, "ms")

    # Parallelized
    with Runtime() as rt:

        @parameter
        fn bench_parallel[simd_width: Int]():
            parallelize[worker](rt, width, 5 * num_cores())

        alias simd_width = simdwidthof[DType.float64]()
        let parallelized = Benchmark().run[bench_parallel[simd_width]]() / 1e6
        print("Parallelized:", parallelized, "ms")
        print("Parallel speedup:", vectorized / parallelized)


fn main() raises:
    compare()
51 changes: 28 additions & 23 deletions examples/matmul.mojo
@@ -27,6 +27,7 @@ from algorithm import vectorize, parallelize, vectorize_unroll
 from algorithm import Static2DTileUnitFunc as Tile2DFunc
 from python.object import PythonObject
 from python.python import Python, _destroy_python, _init_python
+from runtime.llcl import Runtime
 
 
 struct Matrix:
@@ -82,7 +83,7 @@ fn run_matmul_python(M: Int, N: Int, K: Int) -> Float64:
     return gflops
 
 
-fn matmul_naive(C: Matrix, A: Matrix, B: Matrix):
+fn matmul_naive(C: Matrix, A: Matrix, B: Matrix, _rt: Runtime):
     for m in range(C.rows):
         for k in range(A.cols):
             for n in range(C.cols):
@@ -93,7 +94,7 @@ fn matmul_naive(C: Matrix, A: Matrix, B: Matrix):
 alias nelts = simdwidthof[DType.float32]()  # The SIMD vector width.
 
 
-fn matmul_vectorized_0(C: Matrix, A: Matrix, B: Matrix):
+fn matmul_vectorized_0(C: Matrix, A: Matrix, B: Matrix, _rt: Runtime):
     for m in range(C.rows):
         for k in range(A.cols):
             for nv in range(0, C.cols, nelts):
@@ -108,7 +109,7 @@ fn matmul_vectorized_0(C: Matrix, A: Matrix, B: Matrix):
 
 # Simplify the code by using the builtin vectorize function
 # from Functional import vectorize
-fn matmul_vectorized_1(C: Matrix, A: Matrix, B: Matrix):
+fn matmul_vectorized_1(C: Matrix, A: Matrix, B: Matrix, _rt: Runtime):
     for m in range(C.rows):
         for k in range(A.cols):
 
@@ -123,7 +124,7 @@ fn matmul_vectorized_1(C: Matrix, A: Matrix, B: Matrix):
 
 # Parallelize the code by using the builtin parallelize function
 # from Functional import parallelize
-fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix):
+fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
     @parameter
     fn calc_row(m: Int):
         for k in range(A.cols):
@@ -136,7 +137,7 @@ fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix):
 
             vectorize[nelts, dot](C.cols)
 
-    parallelize[calc_row](C.rows)
+    parallelize[calc_row](rt, C.rows)
 
 
 # Perform 2D tiling on the iteration space defined by end_x and end_y.
@@ -148,7 +149,7 @@ fn tile[tiled_fn: Tile2DFunc, tile_x: Int, tile_y: Int](end_x: Int, end_y: Int):
 
 
 # Use the above tile function to perform tiled matmul.
-fn matmul_tiled_parallelized(C: Matrix, A: Matrix, B: Matrix):
+fn matmul_tiled_parallelized(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
     @parameter
     fn calc_row(m: Int):
         @parameter
@@ -172,12 +173,14 @@ fn matmul_tiled_parallelized(C: Matrix, A: Matrix, B: Matrix):
         alias tile_size = 4
         tile[calc_tile, nelts * tile_size, tile_size](A.cols, C.cols)
 
-    parallelize[calc_row](C.rows)
+    parallelize[calc_row](rt, C.rows)
 
 
 # Unroll the vectorized loop by a constant factor.
 # from Functional import vectorize_unroll
-fn matmul_tiled_unrolled_parallelized(C: Matrix, A: Matrix, B: Matrix):
+fn matmul_tiled_unrolled_parallelized(
+    C: Matrix, A: Matrix, B: Matrix, rt: Runtime
+):
     @parameter
     fn calc_row(m: Int):
         @parameter
@@ -202,31 +205,33 @@ fn matmul_tiled_unrolled_parallelized(C: Matrix, A: Matrix, B: Matrix):
         alias tile_size = 4
         tile[calc_tile, nelts * tile_size, tile_size](A.cols, C.cols)
 
-    parallelize[calc_row](C.rows)
+    parallelize[calc_row](rt, C.rows)
 
 
 @always_inline
 fn benchmark[
-    func: fn (Matrix, Matrix, Matrix) -> None
+    func: fn (Matrix, Matrix, Matrix, Runtime) -> None
 ](M: Int, N: Int, K: Int, base_gflops: Float64, str: String):
     var C = Matrix(M, N)
     C.zero()
     var A = Matrix(M, K)
     var B = Matrix(K, N)
 
-    @always_inline
-    @parameter
-    fn test_fn():
-        _ = func(C, A, B)
-
-    let secs = Float64(Benchmark().run[test_fn]()) / 1_000_000_000
-    # Prevent the matrices from being freed before the benchmark run
-    _ = (A, B, C)
-    let gflops = ((2 * M * N * K) / secs) / 1e9
-    let speedup: Float64 = gflops / base_gflops
-    # print(gflops, "GFLOP/s", speedup, " speedup")
-    print(str)
-    print(gflops, "GFLOP/s <>", speedup.to_int(), "x speedup over Python")
+    with Runtime() as rt:
+
+        @always_inline
+        @parameter
+        fn test_fn():
+            _ = func(C, A, B, rt)
+
+        let secs = Float64(Benchmark().run[test_fn]()) / 1_000_000_000
+        # Prevent the matrices from being freed before the benchmark run
+        _ = (A, B, C)
+        let gflops = ((2 * M * N * K) / secs) / 1e9
+        let speedup: Float64 = gflops / base_gflops
+        # print(gflops, "GFLOP/s", speedup, " speedup")
+        print(str)
+        print(gflops, "GFLOP/s <>", speedup.to_int(), "x speedup over Python")
 
 
 fn main():