Skip to content

Commit 0147b47

Browse files
committed
Naive matrix multiplication
1 parent c090410 commit 0147b47

2 files changed

Lines changed: 309 additions & 0 deletions

File tree

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
### Use one GPU thread for each cell of the output matrix
2+
### Stages a column of B in thread-local memory via stack_allocation
3+
4+
from gpu.host import DeviceContext, HostBuffer
5+
from gpu import thread_idx, block_idx, block_dim
6+
import random
7+
from layout import Layout, LayoutTensor
8+
from memory import UnsafePointer, memcpy, stack_allocation
9+
from python import Python, PythonObject
10+
from testing import assert_true
11+
from algorithm import vectorize
12+
from sys import simdwidthof, strided_load
13+
14+
15+
# Matrix dimensions: C (ROWS_C x COLS_C) = A (ROWS_A x COLS_A) @ B (ROWS_B x COLS_B).
# COLS_A == ROWS_B is the shared inner dimension of the product.
alias ROWS_A = 9
alias COLS_A = 17
alias ROWS_B = 17
alias COLS_B = 7
alias ROWS_C = ROWS_A
alias COLS_C = COLS_B

# Uniform range used when randomizing the input matrices.
alias MATRIX_MIN_ELEM = -5.0
alias MATRIX_MAX_ELEM = 5.0

alias dtype = DType.float32
# Threads per block along (x, y).
alias THREADS = (5, 5)
# Blocks in the grid: ceiling division so every output cell gets a thread.
alias BLOCKS = (
    (COLS_C + THREADS[0] - 1) // THREADS[0],
    (ROWS_C + THREADS[1] - 1) // THREADS[1],
)

# Row-major layouts describing the three matrices.
alias layout_a = Layout.row_major(ROWS_A, COLS_A)
alias layout_b = Layout.row_major(ROWS_B, COLS_B)
alias layout_c = Layout.row_major(ROWS_C, COLS_C)


alias MatrixA = LayoutTensor[dtype, layout_a, MutableAnyOrigin]
alias MatrixB = LayoutTensor[dtype, layout_b, MutableAnyOrigin]
alias MatrixC = LayoutTensor[dtype, layout_c, MutableAnyOrigin]
# One-SIMD-vector scratch tensor; passed to the kernel but not read by it.
alias Storage = LayoutTensor[
    dtype, Layout.row_major(1, simdwidthof[dtype]()), MutableAnyOrigin
]
45+
46+
47+
fn matmul_thread_per_output_cell_vectorized(
    A: MatrixA, B: MatrixB, C: MatrixC, store: Storage
):
    # One GPU thread per output cell C[i, j]; the dot product over the
    # inner dimension is vectorized with SIMD loads.
    # NOTE(review): the tile below lives in per-thread stack memory
    # (stack_allocation), not in shared memory, despite the file header.
    # `store` is accepted for signature compatibility but is unused here.
    var i = block_idx.y * block_dim.y + thread_idx.y  # Row of C
    var j = block_idx.x * block_dim.x + thread_idx.x  # Column of C
    if i < ROWS_C and j < COLS_C:
        # Copy column j of B into a contiguous thread-local buffer so it
        # can be read with vector loads (a column of row-major B is strided).
        tile = stack_allocation[ROWS_B, Scalar[dtype]]()
        each_b_col = B.tile[ROWS_B, 1](0, j)
        for k in range(ROWS_B):
            tile[k] = each_b_col[k, 0][0]

        # Accumulate in a register and write C once, instead of a global
        # read-modify-write on every SIMD chunk as the original did.
        var acc = Scalar[dtype](0)

        @parameter
        fn dotproduct[simd_width: Int](idx: Int):
            acc += (
                A.load[width=simd_width](i, idx)
                * tile.load[width=simd_width](idx)
            ).reduce_add()

        vectorize[dotproduct, simdwidthof[dtype]()](ROWS_B)
        C[i, j] += acc
66+
67+
68+
# Fill the host-mapped buffer with uniform random values in
# [MATRIX_MIN_ELEM, MATRIX_MAX_ELEM].
fn fill_buffer(buffer: HostBuffer[dtype]):
    # Re-seed from system entropy so each run produces fresh matrices.
    random.seed()
    for idx in range(len(buffer)):
        var value = random.random_float64(MATRIX_MIN_ELEM, MATRIX_MAX_ELEM)
        buffer[idx] = value.cast[dtype]()[0]
76+
77+
78+
fn main():
    # Allocate device buffers, randomize A and B on the host, launch the
    # vectorized per-cell matmul kernel, and validate the result with NumPy.
    try:
        ctx = DeviceContext()

        buffer_a = ctx.enqueue_create_buffer[dtype](
            ROWS_A * COLS_A
        ).enqueue_fill(0.0)
        buffer_b = ctx.enqueue_create_buffer[dtype](
            ROWS_B * COLS_B
        ).enqueue_fill(0.0)
        buffer_c = ctx.enqueue_create_buffer[dtype](
            ROWS_C * COLS_C
        ).enqueue_fill(0.0)

        # Scratch buffer handed to the kernel (currently unused by it).
        store = ctx.enqueue_create_buffer[dtype](
            simdwidthof[dtype]()
        ).enqueue_fill(0.0)

        # Randomize the inputs while mapped into host memory.
        with buffer_a.map_to_host() as h_buffer_a:
            fill_buffer(h_buffer_a)

        with buffer_b.map_to_host() as h_buffer_b:
            fill_buffer(h_buffer_b)

        matrix_a = MatrixA(buffer_a)
        matrix_b = MatrixB(buffer_b)
        matrix_c = MatrixC(buffer_c)
        storage = Storage(store)

        ctx.enqueue_function[matmul_thread_per_output_cell_vectorized](
            matrix_a,
            matrix_b,
            matrix_c,
            storage,
            grid_dim=BLOCKS,
            block_dim=THREADS,
        )

        ctx.synchronize()

        # Map everything back to the host and compare C against NumPy.
        with buffer_a.map_to_host() as h_buffer_a:
            with buffer_b.map_to_host() as h_buffer_b:
                with buffer_c.map_to_host() as h_buffer_c:
                    assert_allclose(
                        (ROWS_A, COLS_A, h_buffer_a),
                        (ROWS_B, COLS_B, h_buffer_b),
                        (ROWS_C, COLS_C, h_buffer_c),
                    )

    except e:
        # Fixed typo in the original message ("Prininting").
        print("Printing here: ", e)
129+
130+
131+
fn assert_allclose(
    buff_a_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_b_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_c_with_dims: (Int, Int, HostBuffer[dtype]),
) raises:
    # Rebuild NumPy matrices from the (rows, cols, buffer) triples and
    # assert that A @ B is numerically close to the kernel's C.
    a_rows, a_cols, a_buff = buff_a_with_dims
    b_rows, b_cols, b_buff = buff_b_with_dims
    c_rows, c_cols, c_buff = buff_c_with_dims

    matrix_a = reshape(to_ndarray(a_buff), a_rows, a_cols)
    matrix_b = reshape(to_ndarray(b_buff), b_rows, b_cols)
    matrix_c = reshape(to_ndarray(c_buff), c_rows, c_cols)

    np = Python.import_module("numpy")
    assert_true(np.allclose(np.matmul(matrix_a, matrix_b), matrix_c))
    print("Assertion was successful")
147+
148+
149+
fn to_ndarray(buffer: HostBuffer[dtype]) raises -> PythonObject:
    # Copy the host buffer into a fresh 1-D float32 NumPy array.
    np = Python.import_module("numpy")
    ndarray = np.zeros(len(buffer), dtype=np.float32)
    # Renamed local (was `ndarray_ptr`) so it no longer shadows the
    # module-level `ndarray_ptr` helper it calls.
    dst_ptr = ndarray_ptr[dtype](ndarray)
    src_ptr = buffer.unsafe_ptr()
    memcpy(dst_ptr, src_ptr, len(buffer))
    return ndarray
156+
157+
158+
fn reshape(ndarray: PythonObject, rows: Int, cols: Int) raises -> PythonObject:
    # Thin wrapper over numpy.ndarray.reshape producing a rows x cols view.
    reshaped = ndarray.reshape(rows, cols)
    return reshaped
160+
161+
162+
fn ndarray_ptr[
    dtype: DType
](ndarray: PythonObject) raises -> UnsafePointer[Scalar[dtype]]:
    # Pull the raw data pointer out of the NumPy array through the
    # __array_interface__ protocol ("data" holds (address, read_only)).
    data_field = ndarray.__array_interface__["data"]
    return data_field[0].unsafe_get_as_pointer[dtype]()
166+
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
### Matrix multiplication 1 GPU thread per output column
2+
### Simulates a CPU-style naive matrix multiplication, one thread per output column
3+
4+
from gpu.host import DeviceContext, HostBuffer
5+
from gpu import thread_idx, block_idx, block_dim
6+
import random
7+
from layout import Layout, LayoutTensor
8+
from memory import UnsafePointer, memcpy
9+
from python import Python, PythonObject
10+
from testing import assert_true
11+
12+
# Matrix dimensions: C (ROWS_C x COLS_C) = A (ROWS_A x COLS_A) @ B (ROWS_B x COLS_B).
# COLS_A == ROWS_B is the shared inner dimension of the product.
alias ROWS_A = 33
alias COLS_A = 13
alias ROWS_B = 13
alias COLS_B = 8
alias ROWS_C = ROWS_A
alias COLS_C = COLS_B

# Uniform range used when randomizing the input matrices.
alias MATRIX_MIN_ELEM = -5.0
alias MATRIX_MAX_ELEM = 5.0

alias dtype = DType.float32
# One thread per output column, all in a single block.
alias THREADS = COLS_C
alias BLOCKS = 1

# Row-major layouts describing the three matrices.
alias layout_a = Layout.row_major(ROWS_A, COLS_A)
alias layout_b = Layout.row_major(ROWS_B, COLS_B)
alias layout_c = Layout.row_major(ROWS_C, COLS_C)


alias MatrixA = LayoutTensor[dtype, layout_a, MutableAnyOrigin]
alias MatrixB = LayoutTensor[dtype, layout_b, MutableAnyOrigin]
alias MatrixC = LayoutTensor[dtype, layout_c, MutableAnyOrigin]
36+
37+
38+
fn naive_matmul_one_thread_per_col[
    a: Layout, b: Layout, c: Layout
](A: MatrixA, B: MatrixB, C: MatrixC,):
    # Each thread computes one full column of C: C[:, tid].
    # NOTE(review): the layout parameters a/b/c are never read — the body
    # uses the module-level aliases; kept for call-site compatibility.
    var tid = block_idx.x * block_dim.x + thread_idx.x

    if tid < COLS_C:  # Each thread id `tid` is a column of C (and of B)
        for i in range(ROWS_A):
            # Accumulate in a register and store once per output cell
            # instead of a global read-modify-write on every k.
            var acc = C[i, tid]
            for k in range(COLS_A):
                acc += A[i, k] * B[k, tid]
            C[i, tid] = acc
47+
48+
49+
# Fill the host-mapped buffer with uniform random values in
# [MATRIX_MIN_ELEM, MATRIX_MAX_ELEM].
fn fill_buffer(buffer: HostBuffer[dtype]):
    # Re-seed from system entropy so each run produces fresh matrices.
    random.seed()
    for idx in range(len(buffer)):
        var value = random.random_float64(MATRIX_MIN_ELEM, MATRIX_MAX_ELEM)
        buffer[idx] = value.cast[dtype]()[0]
57+
58+
59+
fn main():
    # Allocate device buffers, randomize A and B on the host, launch the
    # naive one-thread-per-column kernel, and validate the result with NumPy.
    try:
        ctx = DeviceContext()

        buffer_a = ctx.enqueue_create_buffer[dtype](
            ROWS_A * COLS_A
        ).enqueue_fill(0.0)
        buffer_b = ctx.enqueue_create_buffer[dtype](
            ROWS_B * COLS_B
        ).enqueue_fill(0.0)
        buffer_c = ctx.enqueue_create_buffer[dtype](
            ROWS_C * COLS_C
        ).enqueue_fill(0.0)

        # Randomize the inputs while mapped into host memory.
        with buffer_a.map_to_host() as h_buffer_a:
            fill_buffer(h_buffer_a)

        with buffer_b.map_to_host() as h_buffer_b:
            fill_buffer(h_buffer_b)

        matrix_a = MatrixA(buffer_a)
        matrix_b = MatrixB(buffer_b)
        matrix_c = MatrixC(buffer_c)

        ctx.enqueue_function[
            naive_matmul_one_thread_per_col[layout_a, layout_b, layout_c]
        ](
            matrix_a,
            matrix_b,
            matrix_c,
            grid_dim=BLOCKS,
            block_dim=THREADS,
        )

        ctx.synchronize()

        # Map everything back to the host and compare C against NumPy.
        with buffer_a.map_to_host() as h_buffer_a:
            with buffer_b.map_to_host() as h_buffer_b:
                with buffer_c.map_to_host() as h_buffer_c:
                    assert_allclose(
                        (ROWS_A, COLS_A, h_buffer_a),
                        (ROWS_B, COLS_B, h_buffer_b),
                        (ROWS_C, COLS_C, h_buffer_c),
                    )

    except e:
        # Fixed typo in the original message ("Prininting").
        print("Printing here: ", e)
106+
107+
108+
fn assert_allclose(
    buff_a_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_b_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_c_with_dims: (Int, Int, HostBuffer[dtype]),
) raises:
    # Rebuild NumPy matrices from the (rows, cols, buffer) triples and
    # assert that A @ B is numerically close to the kernel's C.
    a_rows, a_cols, a_buff = buff_a_with_dims
    b_rows, b_cols, b_buff = buff_b_with_dims
    c_rows, c_cols, c_buff = buff_c_with_dims

    matrix_a = reshape(to_ndarray(a_buff), a_rows, a_cols)
    matrix_b = reshape(to_ndarray(b_buff), b_rows, b_cols)
    matrix_c = reshape(to_ndarray(c_buff), c_rows, c_cols)

    np = Python.import_module("numpy")
    assert_true(np.allclose(np.matmul(matrix_a, matrix_b), matrix_c))
    print("Assertion was successful")
124+
125+
126+
fn to_ndarray(buffer: HostBuffer[dtype]) raises -> PythonObject:
    # Copy the host buffer into a fresh 1-D float32 NumPy array.
    np = Python.import_module("numpy")
    ndarray = np.zeros(len(buffer), dtype=np.float32)
    # Renamed local (was `ndarray_ptr`) so it no longer shadows the
    # module-level `ndarray_ptr` helper it calls.
    dst_ptr = ndarray_ptr[dtype](ndarray)
    src_ptr = buffer.unsafe_ptr()
    memcpy(dst_ptr, src_ptr, len(buffer))
    return ndarray
133+
134+
135+
fn reshape(ndarray: PythonObject, rows: Int, cols: Int) raises -> PythonObject:
    # Thin wrapper over numpy.ndarray.reshape producing a rows x cols view.
    reshaped = ndarray.reshape(rows, cols)
    return reshaped
137+
138+
139+
fn ndarray_ptr[
    dtype: DType
](ndarray: PythonObject) raises -> UnsafePointer[Scalar[dtype]]:
    # Pull the raw data pointer out of the NumPy array through the
    # __array_interface__ protocol ("data" holds (address, read_only)).
    data_field = ndarray.__array_interface__["data"]
    return data_field[0].unsafe_get_as_pointer[dtype]()
143+

0 commit comments

Comments
 (0)