Skip to content

Commit 5547a38

Browse files
committed
Absolute naive matrix multiplication
1 parent 78af51a commit 5547a38

File tree

2 files changed

+453
-0
lines changed

2 files changed

+453
-0
lines changed

gpu_puzzles/naive_matmul.ipynb

Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
": {
2+
"kernelspec": {
3+
"display_name": "Python 3",
4+
"name": "python3"
5+
},
6+
"language_info": {
7+
"name": "python",
8+
"version": "3.11.11",
9+
"mimetype": "text/x-python",
10+
"codemirror_mode": {
11+
"name": "ipython",
12+
"version": 3
13+
},
14+
"pygments_lexer": "ipython3",
15+
"nbconvert_exporter": "python",
16+
"file_extension": ".py"
17+
},
18+
"kaggle": {
19+
"accelerator": "nvidiaTeslaT4",
20+
"dataSources": [],
21+
"dockerImageVersionId": 31041,
22+
"isInternetEnabled": true,
23+
"language": "python",
24+
"sourceType": "notebook",
25+
"isGpuEnabled": true
26+
},
27+
"colab": {
28+
"provenance": [],
29+
"gpuType": "T4"
30+
},
31+
"accelerator": "GPU"
32+
},
33+
"nbformat_minor": 0,
34+
"nbformat": 4,
35+
"cells": [
36+
{
37+
"cell_type": "code",
38+
"source": [
39+
"!curl -ssL https://magic.modular.com/ | bash"
40+
],
41+
"metadata": {
42+
"trusted": true,
43+
"id": "buOgxm25ONit"
44+
},
45+
"outputs": [],
46+
"execution_count": null
47+
},
48+
{
49+
"cell_type": "code",
50+
"source": [
51+
"import os\n",
52+
"os.environ['PATH'] +=':/root/.modular/bin'"
53+
],
54+
"metadata": {
55+
"trusted": true,
56+
"id": "FVZvyhRiONiw"
57+
},
58+
"outputs": [],
59+
"execution_count": null
60+
},
61+
{
62+
"cell_type": "code",
63+
"source": [
64+
"!magic init gpu_puzzles --format mojoproject"
65+
],
66+
"metadata": {
67+
"trusted": true,
68+
"id": "TqFD0EK0ONiw"
69+
},
70+
"outputs": [],
71+
"execution_count": null
72+
},
73+
{
74+
"cell_type": "code",
75+
"source": [
76+
"%cd gpu_puzzles/"
77+
],
78+
"metadata": {
79+
"trusted": true,
80+
"id": "k3Ddb6GcONiw"
81+
},
82+
"outputs": [],
83+
"execution_count": null
84+
},
85+
{
86+
"cell_type": "code",
87+
"source": [
88+
"%%writefile naive_matmaul.mojo\n",
89+
"\n",
90+
"### Dumb matrix multiplication\n",
91+
"### Simulates the CPU-style triple for-loop matrix multiplication on a single GPU thread\n",
92+
"\n",
93+
"from gpu.host import DeviceContext, HostBuffer\n",
94+
"from gpu import thread_idx, block_idx, block_dim\n",
95+
"import random\n",
96+
"from layout import Layout, LayoutTensor\n",
97+
"from memory import UnsafePointer, memcpy\n",
98+
"from python import Python, PythonObject\n",
99+
"from testing import assert_true\n",
100+
"\n",
101+
"\n",
102+
"alias ROWS_A = 8\n",
103+
"alias COLS_A = 16\n",
104+
"alias ROWS_B = 16\n",
105+
"alias COLS_B = 8\n",
106+
"alias ROWS_C = 8\n",
107+
"alias COLS_C = 8\n",
108+
"\n",
109+
"\n",
110+
"alias MATRIX_MIN_ELEM = -5.0\n",
111+
"alias MATRIX_MAX_ELEM = 5.0\n",
112+
"\n",
113+
"alias dtype = DType.float32\n",
114+
"# Num threads per block\n",
115+
"alias THREADS = 1\n",
116+
"# Total number of blocks in the grid\n",
117+
"alias BLOCKS = 1\n",
118+
"\n",
119+
"alias layout_a = Layout.row_major(ROWS_A, COLS_A)\n",
120+
"alias layout_b = Layout.row_major(ROWS_B, COLS_B)\n",
121+
"alias layout_c = Layout.row_major(ROWS_C, COLS_C)\n",
122+
"\n",
123+
"# alias Matrix = LayoutTensor[dtype, _, MutableAnyOrigin]\n",
124+
"alias Matrix = LayoutTensor[mut=True, dtype, _]\n",
125+
"\n",
126+
"\n",
127+
"fn naive_matmaul(\n",
128+
" A: UnsafePointer[Scalar[dtype]],\n",
129+
" B: UnsafePointer[Scalar[dtype]],\n",
130+
" C: UnsafePointer[Scalar[dtype]],\n",
131+
"):\n",
132+
" var tid = block_idx.x * block_dim.x + thread_idx.x\n",
133+
"\n",
134+
" if tid == 0:\n",
135+
" for i in range(ROWS_A):\n",
136+
" for j in range(COLS_B):\n",
137+
" for k in range(COLS_A):\n",
138+
" (C + i * COLS_C + j)[] += (A + i * COLS_A + k)[] * (\n",
139+
" B + k * COLS_B + j\n",
140+
" )[]\n",
141+
"\n",
142+
"\n",
143+
"# Fill the matrix buffer with random values between MATRIX_MIN_ELEM and MATRIX_MAX_ELEM\n",
144+
"fn fill_buffer(buffer: HostBuffer[dtype]):\n",
145+
" # Randomize\n",
146+
" # random.seed()\n",
147+
" for i in range(len(buffer)):\n",
148+
" buffer[i] = random.random_float64(\n",
149+
" MATRIX_MIN_ELEM, MATRIX_MAX_ELEM\n",
150+
" ).cast[dtype]()[0]\n",
151+
"\n",
152+
"\n",
153+
"fn main():\n",
154+
" try:\n",
155+
" ctx = DeviceContext()\n",
156+
"\n",
157+
" buffer_a = ctx.enqueue_create_buffer[dtype](\n",
158+
" ROWS_A * COLS_A\n",
159+
" ).enqueue_fill(0.0)\n",
160+
" buffer_b = ctx.enqueue_create_buffer[dtype](\n",
161+
" ROWS_B * COLS_B\n",
162+
" ).enqueue_fill(0.0)\n",
163+
" buffer_c = ctx.enqueue_create_buffer[dtype](\n",
164+
" ROWS_C * COLS_C\n",
165+
" ).enqueue_fill(0.0)\n",
166+
"\n",
167+
" with buffer_a.map_to_host() as h_buffer_a:\n",
168+
" fill_buffer(h_buffer_a)\n",
169+
"\n",
170+
" with buffer_b.map_to_host() as h_buffer_b:\n",
171+
" fill_buffer(h_buffer_b)\n",
172+
"\n",
173+
" # matrix_a = LayoutTensor[dtype, layout_a, MutableAnyOrigin](buffer_a)\n",
174+
" # matrix_b = LayoutTensor[dtype, layout_b, MutableAnyOrigin](buffer_b)\n",
175+
" # matrix_c = LayoutTensor[dtype, layout_c, MutableAnyOrigin](buffer_c)\n",
176+
"\n",
177+
" ctx.enqueue_function[naive_matmaul](\n",
178+
" buffer_a.unsafe_ptr(),\n",
179+
" buffer_b.unsafe_ptr(),\n",
180+
" buffer_c.unsafe_ptr(),\n",
181+
" grid_dim=BLOCKS,\n",
182+
" block_dim=THREADS,\n",
183+
" )\n",
184+
"\n",
185+
" ctx.synchronize()\n",
186+
"\n",
187+
" with buffer_a.map_to_host() as h_buffer_a:\n",
188+
" with buffer_b.map_to_host() as h_buffer_b:\n",
189+
" with buffer_c.map_to_host() as h_buffer_c:\n",
190+
" assert_allclose(\n",
191+
" (ROWS_A, COLS_A, h_buffer_a),\n",
192+
" (ROWS_B, COLS_B, h_buffer_b),\n",
193+
" (ROWS_C, COLS_C, h_buffer_c),\n",
194+
" )\n",
195+
"\n",
196+
" except e:\n",
197+
" print(\"Prininting here: \", e)\n",
198+
"\n",
199+
"\n",
200+
"fn assert_allclose(\n",
201+
" buff_a_with_dims: (Int, Int, HostBuffer[dtype]),\n",
202+
" buff_b_with_dims: (Int, Int, HostBuffer[dtype]),\n",
203+
" buff_c_with_dims: (Int, Int, HostBuffer[dtype]),\n",
204+
") raises:\n",
205+
" a_rows, a_cols, a_buff = buff_a_with_dims\n",
206+
" matrix_a = reshape(to_ndarray(a_buff), a_rows, a_cols)\n",
207+
"\n",
208+
" b_rows, b_cols, b_buff = buff_b_with_dims\n",
209+
" matrix_b = reshape(to_ndarray(b_buff), b_rows, b_cols)\n",
210+
"\n",
211+
" c_rows, c_cols, c_buff = buff_c_with_dims\n",
212+
" matrix_c = reshape(to_ndarray(c_buff), c_rows, c_cols)\n",
213+
" np = Python.import_module(\"numpy\")\n",
214+
" assert_true(np.allclose(np.matmul(matrix_a, matrix_b), matrix_c))\n",
215+
" print(\"Assertion was successful\")\n",
216+
"\n",
217+
"\n",
218+
"fn to_ndarray(buffer: HostBuffer[dtype]) raises -> PythonObject:\n",
219+
" np = Python.import_module(\"numpy\")\n",
220+
" ndarray = np.zeros(len(buffer), dtype=np.float32)\n",
221+
" ndarray_ptr = ndarray_ptr[dtype](ndarray)\n",
222+
" buffer_ptr = buffer.unsafe_ptr()\n",
223+
" memcpy(ndarray_ptr, buffer_ptr, len(buffer))\n",
224+
" return ndarray\n",
225+
"\n",
226+
"\n",
227+
"fn reshape(ndarray: PythonObject, rows: Int, cols: Int) raises -> PythonObject:\n",
228+
" return ndarray.reshape(rows, cols)\n",
229+
"\n",
230+
"\n",
231+
"fn ndarray_ptr[\n",
232+
" dtype: DType\n",
233+
"](ndarray: PythonObject) raises -> UnsafePointer[Scalar[dtype]]:\n",
234+
" return ndarray.__array_interface__[\"data\"][0].unsafe_get_as_pointer[dtype]()"
235+
],
236+
"metadata": {
237+
"trusted": true,
238+
"execution": {
239+
"iopub.status.busy": "2025-05-17T17:07:37.176099Z",
240+
"iopub.execute_input": "2025-05-17T17:07:37.176782Z",
241+
"iopub.status.idle": "2025-05-17T17:07:37.183766Z",
242+
"shell.execute_reply.started": "2025-05-17T17:07:37.176750Z",
243+
"shell.execute_reply": "2025-05-17T17:07:37.183011Z"
244+
},
245+
"id": "IaxB1auxONix"
246+
},
247+
"outputs": [],
248+
"execution_count": null
249+
},
250+
{
251+
"cell_type": "code",
252+
"source": [
253+
"!magic run mojo naive_matmaul.mojo"
254+
],
255+
"metadata": {
256+
"trusted": true,
257+
"execution": {
258+
"iopub.status.busy": "2025-05-17T17:07:53.713699Z",
259+
"iopub.execute_input": "2025-05-17T17:07:53.713971Z",
260+
"iopub.status.idle": "2025-05-17T17:08:01.501894Z",
261+
"shell.execute_reply.started": "2025-05-17T17:07:53.713950Z",
262+
"shell.execute_reply": "2025-05-17T17:08:01.501214Z"
263+
},
264+
"colab": {
265+
"base_uri": "https://localhost:8080/"
266+
},
267+
"id": "h2k9wkDaONiz",
268+
"outputId": "20528bac-8ce4-4cb5-829c-c4ed968d7172"
269+
},
270+
"outputs": [
271+
{
272+
"output_type": "stream",
273+
"name": "stdout",
274+
"text": [
275+
"\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[1m/content/gpu_puzzles/naive_matmaul.mojo:1:1: \u001b[0m\u001b[0;1;35mwarning: \u001b[0m\u001b[1mstruct 'HostBuffer' utilizes conformance to trait 'Copyable & Movable' but does not explicitly declare it (implicit conformance is deprecated)\n",
276+
"\u001b[0m### Dumb matrix multiplication\n",
277+
"\u001b[0;1;32m^\n",
278+
"\u001b[0m\u001b[1m/content/gpu_puzzles/naive_matmaul.mojo:1:1: \u001b[0m\u001b[0;1;35mwarning: \u001b[0m\u001b[1mstruct 'PythonObject' utilizes conformance to trait 'Boolable' but does not explicitly declare it (implicit conformance is deprecated)\n",
279+
"\u001b[0m### Dumb matrix multiplication\n",
280+
"\u001b[0;1;32m^\n",
281+
"\u001b[0mAssertion was successful\n"
282+
]
283+
}
284+
],
285+
"execution_count": 8
286+
},
287+
{
288+
"cell_type": "code",
289+
"source": [
290+
"!magic run mojo format naive_matmaul.mojo"
291+
],
292+
"metadata": {
293+
"trusted": true,
294+
"execution": {
295+
"iopub.status.busy": "2025-05-17T17:05:36.835819Z",
296+
"iopub.execute_input": "2025-05-17T17:05:36.836092Z",
297+
"iopub.status.idle": "2025-05-17T17:05:37.268163Z",
298+
"shell.execute_reply.started": "2025-05-17T17:05:36.836067Z",
299+
"shell.execute_reply": "2025-05-17T17:05:37.267496Z"
300+
},
301+
"id": "bSglX7bNONi0"
302+
},
303+
"outputs": [],
304+
"execution_count": null
305+
}
306+
]
307+
}

0 commit comments

Comments
 (0)