GPU puzzles annotated

Modular's adaptation of Sasha Rush's CUDA GPU puzzles to Mojo, solved and annotated below.

Puzzle 1

Notes:

  • be aware of what runs on the host (CPU) vs. the device (GPU), and which calls are synchronous vs. asynchronous
  • enqueue_* calls push operations onto a GPU stream and return immediately; the work executes asynchronously (see the sketch below)
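
A minimal warm-up sketch of that lifecycle, separate from the puzzle (assumes a working GPU and DeviceContext):

from gpu.host import DeviceContext

def main():
    with DeviceContext() as ctx:
        # enqueued on the GPU stream; the call returns immediately (async)
        buf = ctx.enqueue_create_buffer[DType.float32](4).enqueue_fill(0)
        # map_to_host synchronizes the stream before the host touches the data
        with buf.map_to_host() as host:
            print(host) # all zeros once the fill has completed

The full Puzzle 1 solution:
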
from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 4
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = SIZE
alias dtype = DType.float32

fn add_10(out: UnsafePointer[Scalar[dtype]], a: UnsafePointer[Scalar[dtype]]):
    i = thread_idx.x
    out[i] = 10 + a[i]

def main():
    with DeviceContext() as ctx:
        print(ctx.api()) # e.g. "cuda" on an NVIDIA GPU
        out = ctx.enqueue_create_buffer[dtype](SIZE) # gpu async 
        out = out.enqueue_fill(0) # gpu async 
        a = ctx.enqueue_create_buffer[dtype](SIZE)
        a = a.enqueue_fill(0)
        with a.map_to_host() as a_host: # map_to_host syncs the stream, so the host can safely write the buffer
            for i in range(SIZE):
                a_host[i] = i

        ctx.enqueue_function[add_10](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        ) # gpu async

        expected = ctx.enqueue_create_host_buffer[dtype](SIZE)
        expected = expected.enqueue_fill(0) # gpu async

        ctx.synchronize() # sync before writing expected on the host; otherwise the enqueued fill(0) could land after the host writes and zero them out

        for i in range(SIZE): 
            expected[i] = i + 10

        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                assert_equal(out_host[i], expected[i])
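
To make that ordering hazard concrete, here is the broken variant sketched out (same setup as above; illustrative only):

from gpu.host import DeviceContext

def main():
    with DeviceContext() as ctx:
        expected = ctx.enqueue_create_host_buffer[DType.float32](4)
        expected = expected.enqueue_fill(0) # queued, not yet executed
        for i in range(4):
            expected[i] = i + 10 # host writes happen now...
        ctx.synchronize() # ...but the queued fill(0) can still run afterwards, wiping them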

Puzzle 2

from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 4
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = SIZE
alias dtype = DType.float32

fn add(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    b: UnsafePointer[Scalar[dtype]],
):
    i = thread_idx.x # 1-D thread index maps one-to-one onto the 1-D data
    out[i] = a[i] + b[i]

def main():
    with DeviceContext() as ctx:
        out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) # input size conveniently equals the block size: one thread per element
        b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) 
        expected = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host, b.map_to_host() as b_host:
            for i in range(SIZE):
                a_host[i] = i
                b_host[i] = i
                expected[i] = a_host[i] + b_host[i]

        ctx.enqueue_function[add](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            b.unsafe_ptr(),
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        ctx.synchronize()

        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                assert_equal(out_host[i], expected[i])
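
Because SIZE equals the block size, thread_idx.x alone suffices here. If the data outgrew one block, the standard pattern combines block and thread indices; a hedged sketch, not part of this puzzle:

from memory import UnsafePointer
from gpu import thread_idx, block_idx, block_dim

fn add_multi_block(
    out: UnsafePointer[Scalar[DType.float32]],
    a: UnsafePointer[Scalar[DType.float32]],
    b: UnsafePointer[Scalar[DType.float32]],
    size: Int,
):
    # global index: blocks before us, times threads per block, plus our slot
    i = block_idx.x * block_dim.x + thread_idx.x
    if i < size: # the last block may be only partially full
        out[i] = a[i] + b[i]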

Puzzle 3

from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal


alias SIZE = 4
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (8, 1) # deliberately more threads (8) than elements (4)
alias dtype = DType.float32

fn add_10_guard(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    i = thread_idx.x
    if i < size: # guard: extra threads would otherwise write out of bounds
        out[i] = a[i] + 10.0

def main():
    with DeviceContext() as ctx:
        out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        expected = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)

        with a.map_to_host() as a_host:
            for i in range(SIZE):
                a_host[i] = i
                expected[i] = i + 10

        ctx.enqueue_function[add_10_guard](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        ctx.synchronize()

        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                assert_equal(out_host[i], expected[i])
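
Which of the 8 launched threads actually do work, enumerated on the host for intuition (a trivial sketch):

def main():
    threads = 8
    size = 4
    for i in range(threads):
        if i < size:
            print("thread", i, "writes out[", i, "]")
        else:
            print("thread", i, "is masked by the guard")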

Puzzle 4

with UnsafePointer

from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext
from testing import assert_equal

alias SIZE = 2
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)
alias dtype = DType.float32

# 2-D block of threads over a flattened 1-D input and output
fn add_10_2d(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    if row < size and col < size: # guard: prevent out-of-bounds accesses
        out[row * size + col] = a[row * size + col] + 10

def main():
    with DeviceContext() as ctx:
        out = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        expected = ctx.enqueue_create_host_buffer[dtype](
            SIZE * SIZE
        ).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host:
            # row-major: element (i, j) lives at flat index i * SIZE + j
            for i in range(SIZE):
                for j in range(SIZE):
                    a_host[i * SIZE + j] = i * SIZE + j
                    expected[i * SIZE + j] = a_host[i * SIZE + j] + 10

        ctx.enqueue_function[add_10_2d](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        ctx.synchronize()

        with out.map_to_host() as out_host:
            print("out:", out_host)
            print("expected:", expected)
            for i in range(SIZE):
                for j in range(SIZE):
                    assert_equal(out_host[i * SIZE + j], expected[i * SIZE + j])
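
Both sides above lean on the same index arithmetic; spelled out with a hypothetical host-side helper (flat_index is not part of the puzzle), it is just:

fn flat_index(row: Int, col: Int, size: Int) -> Int:
    # row-major: each row of `size` elements is laid out contiguously
    return row * size + col

def main():
    # for SIZE = 2: (0,0)->0, (0,1)->1, (1,0)->2, (1,1)->3
    for row in range(2):
        for col in range(2):
            print("(", row, ",", col, ") ->", flat_index(row, col, 2))

LayoutTensor, used next, encapsulates exactly this arithmetic.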

with LayoutTensor

from gpu import thread_idx, block_dim, block_idx
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor
from testing import assert_equal

alias SIZE = 2
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)
alias dtype = DType.float32
alias layout = Layout.row_major(SIZE, SIZE)

# 2d block, 2d input, 2d output
fn add_10_2d(
    out: LayoutTensor[mut=True, dtype, layout],
    a: LayoutTensor[mut=True, dtype, layout],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    if row < size and col < size: 
        out[row, col] = a[row, col] + 10 # index arithmetic is handled by LayoutTensor


def main():
    with DeviceContext() as ctx:
        out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        out_tensor = LayoutTensor[mut=True, dtype, layout](out_buf.unsafe_ptr()) # a view over the existing device buffer, NOT a new ctx allocation
        print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())

        expected = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)

        a = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host:
            for i in range(SIZE * SIZE):
                a_host[i] = i
                expected[i] = a_host[i] + 10

        a_tensor = LayoutTensor[mut=True, dtype, layout](a.unsafe_ptr())

        ctx.enqueue_function[add_10_2d](
            out_tensor,
            a_tensor,
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        ctx.synchronize()

        with out_buf.map_to_host() as out_buf_host:
            print("out:", out_buf_host)
            print("expected:", expected)
            for i in range(SIZE * SIZE):
                assert_equal(out_buf_host[i], expected[i])
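
block_dim and block_idx are imported above but go unused, since a single block covers the whole 2x2 tensor. For a larger tensor tiled across blocks, the kernel would compute global coordinates from both; a hedged sketch under that assumption (BIG and big_layout are made up for illustration):

from gpu import thread_idx, block_dim, block_idx
from layout import Layout, LayoutTensor

alias BIG = 1024
alias big_layout = Layout.row_major(BIG, BIG)

fn add_10_2d_tiled(
    out: LayoutTensor[mut=True, DType.float32, big_layout],
    a: LayoutTensor[mut=True, DType.float32, big_layout],
    size: Int,
):
    # global coordinates: block offset plus position within the block
    row = block_idx.y * block_dim.y + thread_idx.y
    col = block_idx.x * block_dim.x + thread_idx.x
    if row < size and col < size: # guard for partially filled edge blocks
        out[row, col] = a[row, col] + 10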