forked from modular/mojo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduce.mojo
114 lines (92 loc) · 3.55 KB
/
reduce.mojo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# ===----------------------------------------------------------------------=== #
# Copyright (c) 2023, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #
# This sample implements a simple reduction operation on a
# large array of values to produce a single result.
# Reductions and scans are common algorithm patterns in parallel computing.
from benchmark import Benchmark
from time import now
from algorithm import sum
from random import rand
from memory.buffer import Buffer
# Simple array struct
struct ArrayInput:
var data: DTypePointer[DType.float32]
fn __init__(inout self, size: Int):
self.data = DTypePointer[DType.float32].alloc(size)
rand(self.data, size)
fn __del__(owned self):
self.data.free()
@always_inline
fn __getitem__(self, x: Int) -> Float32:
return self.data.load(x)
# Use the https://en.wikipedia.org/wiki/Kahan_summation_algorithm
# Simple summation of the array elements
fn reduce_sum_naive(data: ArrayInput, size: Int) -> Float32:
var sum = data[0]
var c: Float32 = 0.0
for i in range(size):
let y = data[i] - c
let t = sum + y
c = (t - sum) - y
sum = t
return sum
fn benchmark_naive_reduce_sum(size: Int) -> Float32:
print("Computing reduction sum for array num elements: ", size)
var A = ArrayInput(size)
# Prevent DCE
var mySum: Float32 = 0.0
@always_inline
@parameter
fn test_fn():
_ = reduce_sum_naive(A, size)
let bench_time = Float64(Benchmark().run[test_fn]())
return mySum
fn benchmark_stdlib_reduce_sum(size: Int) -> Float32:
# Allocate a Buffer and then use the Mojo stdlib Reduction class
# TODO: Use globals
# alias numElem = size
alias numElem = 1 << 30
# Can use either stack allocation or heap
# see stackalloc
# var A = Buffer[numElem, DType.float32].stack_allocation()
# see heapalloc
var B = DTypePointer[DType.float32].alloc(numElem)
var A = Buffer[numElem, DType.float32](B)
# initialize buffer
for i in range(numElem):
A[i] = Float32(i)
# Prevent DCE
var mySum: Float32 = 0.0
print("Computing reduction sum for array num elements: ", size)
@always_inline
@parameter
fn test_fn():
mySum = sum[numElem, DType.float32](A)
let bench_time = Float64(Benchmark().run[test_fn]())
return mySum
fn main():
# Number of array elements
let size = 1 << 21
print("# Reduction sum across a large array. The naive algorithm's ")
print("# computation time scales with the size of the array; while Mojo ")
print("# exhibits significantly better scaling...")
var eval_begin: Float64 = now()
var sum = benchmark_naive_reduce_sum(size)
var eval_end: Float64 = now()
var execution_time = Float64((eval_end - eval_begin)) / 1e6
print("Completed naive reduction sum: ", sum, " in ", execution_time, "ms")
eval_begin = now()
sum = benchmark_stdlib_reduce_sum(size)
eval_end = now()
execution_time = Float64((eval_end - eval_begin)) / 1e6
print("Completed stdlib reduction sum: ", sum, " in ", execution_time, "ms")