Skip to content

Commit 0147b47

Browse files
committed
Naive matrix multiplication
1 parent c090410 commit 0147b47

2 files changed

Lines changed: 309 additions & 0 deletions

File tree

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
### Use one GPU thread for each cell of the output matrix
2+
### Stages a column of B in thread-local memory via stack_allocation
3+
4+
from gpu.host import DeviceContext, HostBuffer
5+
from gpu import thread_idx, block_idx, block_dim
6+
import random
7+
from layout import Layout, LayoutTensor
8+
from memory import UnsafePointer, memcpy, stack_allocation
9+
from python import Python, PythonObject
10+
from testing import assert_true
11+
from algorithm import vectorize
12+
from sys import simdwidthof, strided_load
13+
14+
15+
# Matrix dimensions: C (ROWS_C x COLS_C) = A (ROWS_A x COLS_A) @ B (ROWS_B x COLS_B).
# COLS_A == ROWS_B is the shared inner dimension of the product.
alias ROWS_A = 9
alias COLS_A = 17
alias ROWS_B = 17
alias COLS_B = 7
alias ROWS_C = ROWS_A
alias COLS_C = COLS_B

# Uniform range used when randomizing the input matrices.
alias MATRIX_MIN_ELEM = -5.0
alias MATRIX_MAX_ELEM = 5.0

alias dtype = DType.float32
# Threads per block along (x, y).
alias THREADS = (5, 5)
# Blocks in the grid: ceiling division so every output cell gets a thread.
alias BLOCKS = (
    (COLS_C + THREADS[0] - 1) // THREADS[0],
    (ROWS_C + THREADS[1] - 1) // THREADS[1],
)

# Row-major layouts describing the three matrices.
alias layout_a = Layout.row_major(ROWS_A, COLS_A)
alias layout_b = Layout.row_major(ROWS_B, COLS_B)
alias layout_c = Layout.row_major(ROWS_C, COLS_C)


alias MatrixA = LayoutTensor[dtype, layout_a, MutableAnyOrigin]
alias MatrixB = LayoutTensor[dtype, layout_b, MutableAnyOrigin]
alias MatrixC = LayoutTensor[dtype, layout_c, MutableAnyOrigin]
# One-SIMD-vector scratch tensor; passed to the kernel but not read by it.
alias Storage = LayoutTensor[
    dtype, Layout.row_major(1, simdwidthof[dtype]()), MutableAnyOrigin
]
45+
46+
47+
fn matmul_thread_per_output_cell_vectorized(
    A: MatrixA, B: MatrixB, C: MatrixC, store: Storage
):
    # One GPU thread per output cell C[i, j]; the dot product over the
    # inner dimension is vectorized with SIMD loads.
    # NOTE(review): the tile below lives in per-thread stack memory
    # (stack_allocation), not in shared memory, despite the file header.
    # `store` is accepted for signature compatibility but is unused here.
    var i = block_idx.y * block_dim.y + thread_idx.y  # Row of C
    var j = block_idx.x * block_dim.x + thread_idx.x  # Column of C
    if i < ROWS_C and j < COLS_C:
        # Copy column j of B into a contiguous thread-local buffer so it
        # can be read with vector loads (a column of row-major B is strided).
        tile = stack_allocation[ROWS_B, Scalar[dtype]]()
        each_b_col = B.tile[ROWS_B, 1](0, j)
        for k in range(ROWS_B):
            tile[k] = each_b_col[k, 0][0]

        # Accumulate in a register and write C once, instead of a global
        # read-modify-write on every SIMD chunk as the original did.
        var acc = Scalar[dtype](0)

        @parameter
        fn dotproduct[simd_width: Int](idx: Int):
            acc += (
                A.load[width=simd_width](i, idx)
                * tile.load[width=simd_width](idx)
            ).reduce_add()

        vectorize[dotproduct, simdwidthof[dtype]()](ROWS_B)
        C[i, j] += acc
66+
67+
68+
# Fill the host-mapped buffer with uniform random values in
# [MATRIX_MIN_ELEM, MATRIX_MAX_ELEM].
fn fill_buffer(buffer: HostBuffer[dtype]):
    # Re-seed from system entropy so each run produces fresh matrices.
    random.seed()
    for idx in range(len(buffer)):
        var value = random.random_float64(MATRIX_MIN_ELEM, MATRIX_MAX_ELEM)
        buffer[idx] = value.cast[dtype]()[0]
76+
77+
78+
fn main():
    # Allocate device buffers, randomize A and B on the host, launch the
    # vectorized per-cell matmul kernel, and validate the result with NumPy.
    try:
        ctx = DeviceContext()

        buffer_a = ctx.enqueue_create_buffer[dtype](
            ROWS_A * COLS_A
        ).enqueue_fill(0.0)
        buffer_b = ctx.enqueue_create_buffer[dtype](
            ROWS_B * COLS_B
        ).enqueue_fill(0.0)
        buffer_c = ctx.enqueue_create_buffer[dtype](
            ROWS_C * COLS_C
        ).enqueue_fill(0.0)

        # Scratch buffer handed to the kernel (currently unused by it).
        store = ctx.enqueue_create_buffer[dtype](
            simdwidthof[dtype]()
        ).enqueue_fill(0.0)

        # Randomize the inputs while mapped into host memory.
        with buffer_a.map_to_host() as h_buffer_a:
            fill_buffer(h_buffer_a)

        with buffer_b.map_to_host() as h_buffer_b:
            fill_buffer(h_buffer_b)

        matrix_a = MatrixA(buffer_a)
        matrix_b = MatrixB(buffer_b)
        matrix_c = MatrixC(buffer_c)
        storage = Storage(store)

        ctx.enqueue_function[matmul_thread_per_output_cell_vectorized](
            matrix_a,
            matrix_b,
            matrix_c,
            storage,
            grid_dim=BLOCKS,
            block_dim=THREADS,
        )

        ctx.synchronize()

        # Map everything back to the host and compare C against NumPy.
        with buffer_a.map_to_host() as h_buffer_a:
            with buffer_b.map_to_host() as h_buffer_b:
                with buffer_c.map_to_host() as h_buffer_c:
                    assert_allclose(
                        (ROWS_A, COLS_A, h_buffer_a),
                        (ROWS_B, COLS_B, h_buffer_b),
                        (ROWS_C, COLS_C, h_buffer_c),
                    )

    except e:
        # Fixed typo in the original message ("Prininting").
        print("Printing here: ", e)
129+
130+
131+
fn assert_allclose(
    buff_a_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_b_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_c_with_dims: (Int, Int, HostBuffer[dtype]),
) raises:
    # Rebuild NumPy matrices from the (rows, cols, buffer) triples and
    # assert that A @ B is numerically close to the kernel's C.
    a_rows, a_cols, a_buff = buff_a_with_dims
    b_rows, b_cols, b_buff = buff_b_with_dims
    c_rows, c_cols, c_buff = buff_c_with_dims

    matrix_a = reshape(to_ndarray(a_buff), a_rows, a_cols)
    matrix_b = reshape(to_ndarray(b_buff), b_rows, b_cols)
    matrix_c = reshape(to_ndarray(c_buff), c_rows, c_cols)

    np = Python.import_module("numpy")
    assert_true(np.allclose(np.matmul(matrix_a, matrix_b), matrix_c))
    print("Assertion was successful")
147+
148+
149+
fn to_ndarray(buffer: HostBuffer[dtype]) raises -> PythonObject:
    # Copy the host buffer into a fresh 1-D float32 NumPy array.
    np = Python.import_module("numpy")
    ndarray = np.zeros(len(buffer), dtype=np.float32)
    # Renamed local (was `ndarray_ptr`) so it no longer shadows the
    # module-level `ndarray_ptr` helper it calls.
    dst_ptr = ndarray_ptr[dtype](ndarray)
    src_ptr = buffer.unsafe_ptr()
    memcpy(dst_ptr, src_ptr, len(buffer))
    return ndarray
156+
157+
158+
fn reshape(ndarray: PythonObject, rows: Int, cols: Int) raises -> PythonObject:
    # Thin wrapper over numpy.ndarray.reshape producing a rows x cols view.
    reshaped = ndarray.reshape(rows, cols)
    return reshaped
160+
161+
162+
fn ndarray_ptr[
    dtype: DType
](ndarray: PythonObject) raises -> UnsafePointer[Scalar[dtype]]:
    # Pull the raw data pointer out of the NumPy array through the
    # __array_interface__ protocol ("data" holds (address, read_only)).
    data_field = ndarray.__array_interface__["data"]
    return data_field[0].unsafe_get_as_pointer[dtype]()
166+
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
### Matrix multiplication 1 GPU thread per output column
2+
### Simulates a CPU-style naive matrix multiplication, one thread per output column
3+
4+
from gpu.host import DeviceContext, HostBuffer
5+
from gpu import thread_idx, block_idx, block_dim
6+
import random
7+
from layout import Layout, LayoutTensor
8+
from memory import UnsafePointer, memcpy
9+
from python import Python, PythonObject
10+
from testing import assert_true
11+
12+
# Matrix dimensions: C (ROWS_C x COLS_C) = A (ROWS_A x COLS_A) @ B (ROWS_B x COLS_B).
# COLS_A == ROWS_B is the shared inner dimension of the product.
alias ROWS_A = 33
alias COLS_A = 13
alias ROWS_B = 13
alias COLS_B = 8
alias ROWS_C = ROWS_A
alias COLS_C = COLS_B

# Uniform range used when randomizing the input matrices.
alias MATRIX_MIN_ELEM = -5.0
alias MATRIX_MAX_ELEM = 5.0

alias dtype = DType.float32
# One thread per output column, all in a single block.
alias THREADS = COLS_C
alias BLOCKS = 1

# Row-major layouts describing the three matrices.
alias layout_a = Layout.row_major(ROWS_A, COLS_A)
alias layout_b = Layout.row_major(ROWS_B, COLS_B)
alias layout_c = Layout.row_major(ROWS_C, COLS_C)


alias MatrixA = LayoutTensor[dtype, layout_a, MutableAnyOrigin]
alias MatrixB = LayoutTensor[dtype, layout_b, MutableAnyOrigin]
alias MatrixC = LayoutTensor[dtype, layout_c, MutableAnyOrigin]
36+
37+
38+
fn naive_matmul_one_thread_per_col[
    a: Layout, b: Layout, c: Layout
](A: MatrixA, B: MatrixB, C: MatrixC,):
    # Each thread computes one full column of C: C[:, tid].
    # NOTE(review): the layout parameters a/b/c are never read — the body
    # uses the module-level aliases; kept for call-site compatibility.
    var tid = block_idx.x * block_dim.x + thread_idx.x

    if tid < COLS_C:  # Each thread id `tid` is a column of C (and of B)
        for i in range(ROWS_A):
            # Accumulate in a register and store once per output cell
            # instead of a global read-modify-write on every k.
            var acc = C[i, tid]
            for k in range(COLS_A):
                acc += A[i, k] * B[k, tid]
            C[i, tid] = acc
47+
48+
49+
# Fill the host-mapped buffer with uniform random values in
# [MATRIX_MIN_ELEM, MATRIX_MAX_ELEM].
fn fill_buffer(buffer: HostBuffer[dtype]):
    # Re-seed from system entropy so each run produces fresh matrices.
    random.seed()
    for idx in range(len(buffer)):
        var value = random.random_float64(MATRIX_MIN_ELEM, MATRIX_MAX_ELEM)
        buffer[idx] = value.cast[dtype]()[0]
57+
58+
59+
fn main():
    # Allocate device buffers, randomize A and B on the host, launch the
    # naive one-thread-per-column kernel, and validate the result with NumPy.
    try:
        ctx = DeviceContext()

        buffer_a = ctx.enqueue_create_buffer[dtype](
            ROWS_A * COLS_A
        ).enqueue_fill(0.0)
        buffer_b = ctx.enqueue_create_buffer[dtype](
            ROWS_B * COLS_B
        ).enqueue_fill(0.0)
        buffer_c = ctx.enqueue_create_buffer[dtype](
            ROWS_C * COLS_C
        ).enqueue_fill(0.0)

        # Randomize the inputs while mapped into host memory.
        with buffer_a.map_to_host() as h_buffer_a:
            fill_buffer(h_buffer_a)

        with buffer_b.map_to_host() as h_buffer_b:
            fill_buffer(h_buffer_b)

        matrix_a = MatrixA(buffer_a)
        matrix_b = MatrixB(buffer_b)
        matrix_c = MatrixC(buffer_c)

        ctx.enqueue_function[
            naive_matmul_one_thread_per_col[layout_a, layout_b, layout_c]
        ](
            matrix_a,
            matrix_b,
            matrix_c,
            grid_dim=BLOCKS,
            block_dim=THREADS,
        )

        ctx.synchronize()

        # Map everything back to the host and compare C against NumPy.
        with buffer_a.map_to_host() as h_buffer_a:
            with buffer_b.map_to_host() as h_buffer_b:
                with buffer_c.map_to_host() as h_buffer_c:
                    assert_allclose(
                        (ROWS_A, COLS_A, h_buffer_a),
                        (ROWS_B, COLS_B, h_buffer_b),
                        (ROWS_C, COLS_C, h_buffer_c),
                    )

    except e:
        # Fixed typo in the original message ("Prininting").
        print("Printing here: ", e)
106+
107+
108+
fn assert_allclose(
    buff_a_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_b_with_dims: (Int, Int, HostBuffer[dtype]),
    buff_c_with_dims: (Int, Int, HostBuffer[dtype]),
) raises:
    # Rebuild NumPy matrices from the (rows, cols, buffer) triples and
    # assert that A @ B is numerically close to the kernel's C.
    a_rows, a_cols, a_buff = buff_a_with_dims
    b_rows, b_cols, b_buff = buff_b_with_dims
    c_rows, c_cols, c_buff = buff_c_with_dims

    matrix_a = reshape(to_ndarray(a_buff), a_rows, a_cols)
    matrix_b = reshape(to_ndarray(b_buff), b_rows, b_cols)
    matrix_c = reshape(to_ndarray(c_buff), c_rows, c_cols)

    np = Python.import_module("numpy")
    assert_true(np.allclose(np.matmul(matrix_a, matrix_b), matrix_c))
    print("Assertion was successful")
124+
125+
126+
fn to_ndarray(buffer: HostBuffer[dtype]) raises -> PythonObject:
    # Copy the host buffer into a fresh 1-D float32 NumPy array.
    np = Python.import_module("numpy")
    ndarray = np.zeros(len(buffer), dtype=np.float32)
    # Renamed local (was `ndarray_ptr`) so it no longer shadows the
    # module-level `ndarray_ptr` helper it calls.
    dst_ptr = ndarray_ptr[dtype](ndarray)
    src_ptr = buffer.unsafe_ptr()
    memcpy(dst_ptr, src_ptr, len(buffer))
    return ndarray
133+
134+
135+
fn reshape(ndarray: PythonObject, rows: Int, cols: Int) raises -> PythonObject:
    # Thin wrapper over numpy.ndarray.reshape producing a rows x cols view.
    reshaped = ndarray.reshape(rows, cols)
    return reshaped
137+
138+
139+
fn ndarray_ptr[
    dtype: DType
](ndarray: PythonObject) raises -> UnsafePointer[Scalar[dtype]]:
    # Pull the raw data pointer out of the NumPy array through the
    # __array_interface__ protocol ("data" holds (address, read_only)).
    data_field = ndarray.__array_interface__["data"]
    return data_field[0].unsafe_get_as_pointer[dtype]()
143+

0 commit comments

Comments
 (0)