# GPU puzzles annotated

Modular's adaptation of the GPU puzzles by Sasha Rush, solved in Mojo and annotated below.
## Puzzle 1

Notes:

- Be aware of host vs. device (CPU vs. GPU), and of sync vs. async execution.
- Operations are enqueued on a GPU stream and executed asynchronously.
```mojo
from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 4
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = SIZE
alias dtype = DType.float32


fn add_10(out: UnsafePointer[Scalar[dtype]], a: UnsafePointer[Scalar[dtype]]):
    i = thread_idx.x
    out[i] = 10 + a[i]


def main():
    with DeviceContext() as ctx:
        print(ctx.api())  # cuda
        out = ctx.enqueue_create_buffer[dtype](SIZE)  # async: queued on the GPU stream
        out = out.enqueue_fill(0)  # async
        a = ctx.enqueue_create_buffer[dtype](SIZE)
        a = a.enqueue_fill(0)
        with a.map_to_host() as a_host:  # mapping synchronizes, ensuring the buffer exists
            for i in range(SIZE):
                a_host[i] = i
        ctx.enqueue_function[add_10](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )  # async
        expected = ctx.enqueue_create_host_buffer[dtype](SIZE)
        expected = expected.enqueue_fill(0)  # async
        # Synchronize *before* writing expected on the host: the fill(0) above is
        # only enqueued, so writing first and syncing after would let the deferred
        # fill overwrite the values.
        ctx.synchronize()
        for i in range(SIZE):
            expected[i] = i + 10
        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                assert_equal(out_host[i], expected[i])
```
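To make the synchronization comment concrete, here is the failing ordering as a sketch (reusing `ctx`, `dtype`, and `SIZE` from above):

```mojo
# BROKEN ordering, for illustration only:
expected = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)
for i in range(SIZE):
    expected[i] = i + 10  # host writes land while fill(0) is still queued...
ctx.synchronize()  # ...so the deferred fill(0) may run now and wipe them out
```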
## Puzzle 2
```mojo
from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 4
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = SIZE
alias dtype = DType.float32


fn add(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    b: UnsafePointer[Scalar[dtype]],
):
    i = thread_idx.x  # 1-D thread index maps directly onto the 1-D data
    out[i] = a[i] + b[i]


def main():
    with DeviceContext() as ctx:
        out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        # the input size conveniently equals the block size, so no bounds guard is needed
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        expected = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host, b.map_to_host() as b_host:
            for i in range(SIZE):
                a_host[i] = i
                b_host[i] = i
                expected[i] = a_host[i] + b_host[i]
        ctx.enqueue_function[add](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            b.unsafe_ptr(),
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )
        ctx.synchronize()
        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                assert_equal(out_host[i], expected[i])
```
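One detail worth calling out: `expected` is a host buffer while `out`, `a`, and `b` are device buffers, and the two kinds are accessed differently. A minimal sketch of the distinction, reusing only the calls from the solutions above:

```mojo
dev = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)        # device memory
host = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)  # host-visible memory
ctx.synchronize()
host[0] = 1.0                    # host buffers can be indexed directly
with dev.map_to_host() as view:  # device buffers must be mapped first
    view[0] = 1.0
```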
## Puzzle 3
```mojo
from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 4
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (8, 1)  # more threads (8) than data elements (4)
alias dtype = DType.float32


fn add_10_guard(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    i = thread_idx.x
    if i < size:  # guard against out-of-bounds access by the extra threads
        out[i] = a[i] + 10.0


def main():
    with DeviceContext() as ctx:
        out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        expected = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host:
            for i in range(SIZE):
                a_host[i] = i
                expected[i] = i + 10
        ctx.enqueue_function[add_10_guard](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )
        ctx.synchronize()
        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                assert_equal(out_host[i], expected[i])
```
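The guard matters even more once the data no longer fits in a single block. As a sketch (not part of the puzzle, assuming the aliases above), the standard generalization computes a global index from the block and thread indices, with the same bounds check covering the final, partially filled block:

```mojo
from math import ceildiv
from gpu import thread_idx, block_idx, block_dim

alias TPB = 8  # hypothetical threads per block


fn add_10_multiblock(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    i = block_idx.x * block_dim.x + thread_idx.x  # global thread index
    if i < size:  # the last block may have threads past the end
        out[i] = a[i] + 10.0


# launched with enough blocks to cover `size` elements:
#   grid_dim=ceildiv(size, TPB), block_dim=TPB
```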
## Puzzle 4

### With UnsafePointer
```mojo
from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 2
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)
alias dtype = DType.float32


# 2-D block of threads; input and output are flat 1-D buffers
fn add_10_2d(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    if row < size and col < size:  # prevent out-of-bounds access
        out[row * size + col] = a[row * size + col] + 10


def main():
    with DeviceContext() as ctx:
        out = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        expected = ctx.enqueue_create_host_buffer[dtype](
            SIZE * SIZE
        ).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host:
            # row-major: element (i, j) lives at flat index i * SIZE + j
            for i in range(SIZE):
                for j in range(SIZE):
                    a_host[i * SIZE + j] = i * SIZE + j
                    expected[i * SIZE + j] = a_host[i * SIZE + j] + 10
        ctx.enqueue_function[add_10_2d](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )
        ctx.synchronize()
        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                for j in range(SIZE):
                    assert_equal(out_host[i * SIZE + j], expected[i * SIZE + j])
```
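The `row * size + col` arithmetic is repeated at every access site and is easy to get wrong, which motivates the LayoutTensor variant below. As a sketch of the row-major rule in isolation (`flat_index` is a hypothetical helper, not part of the solution):

```mojo
fn flat_index(row: Int, col: Int, size: Int) -> Int:
    # row-major: each row contributes `size` contiguous elements
    return row * size + col


def main():
    print(flat_index(1, 0, 2))  # 2: row 1 starts after the 2 elements of row 0
```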
### With LayoutTensor
```mojo
from gpu import thread_idx, block_dim, block_idx
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor
from testing import assert_equal

alias SIZE = 2
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)
alias dtype = DType.float32
alias layout = Layout.row_major(SIZE, SIZE)


# 2-D block of threads; input and output are 2-D LayoutTensors
fn add_10_2d(
    out: LayoutTensor[mut=True, dtype, layout],
    a: LayoutTensor[mut=True, dtype, layout],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    if row < size and col < size:
        out[row, col] = a[row, col] + 10  # LayoutTensor handles the index arithmetic


def main():
    with DeviceContext() as ctx:
        out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        # a view over the device buffer; NOT created via ctx
        out_tensor = LayoutTensor[mut=True, dtype, layout](out_buf.unsafe_ptr())
        print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())
        expected = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host:
            for i in range(SIZE * SIZE):
                a_host[i] = i
                expected[i] = a_host[i] + 10
        a_tensor = LayoutTensor[mut=True, dtype, layout](a.unsafe_ptr())
        ctx.enqueue_function[add_10_2d](
            out_tensor,
            a_tensor,
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )
        ctx.synchronize()
        with out_buf.map_to_host() as out_buf_host:
            print("out:", out_buf_host)
            print("expected:", expected)
            for i in range(SIZE * SIZE):
                assert_equal(out_buf_host[i], expected[i])
```
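To see the layout mapping without a kernel launch, here is a small host-side sketch (assuming the same imports and aliases as above): indexing the tensor at `[row, col]` reads the flat buffer at `row * SIZE + col`, exactly the arithmetic the pointer version wrote by hand.

```mojo
def main():
    with DeviceContext() as ctx:
        buf = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        ctx.synchronize()
        for i in range(SIZE * SIZE):
            buf[i] = i
        t = LayoutTensor[mut=True, dtype, layout](buf.unsafe_ptr())
        # row-major: t[1, 0] is buf[1 * SIZE + 0]
        print(t[1, 0], "==", buf[1 * SIZE + 0])
```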