Skip to content

Commit 8ed8707

Browse files
committed
Add constant to 2D tensor
1 parent 7c13bcd commit 8ed8707

File tree

2 files changed

+278
-0
lines changed

2 files changed

+278
-0
lines changed

gpu_puzzles/add_10_2d.ipynb

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
{
2+
"metadata": {
3+
"kernelspec": {
4+
"display_name": "Python 3",
5+
"name": "python3"
6+
},
7+
"language_info": {
8+
"name": "python",
9+
"version": "3.11.11",
10+
"mimetype": "text/x-python",
11+
"codemirror_mode": {
12+
"name": "ipython",
13+
"version": 3
14+
},
15+
"pygments_lexer": "ipython3",
16+
"nbconvert_exporter": "python",
17+
"file_extension": ".py"
18+
},
19+
"kaggle": {
20+
"accelerator": "nvidiaTeslaT4",
21+
"dataSources": [],
22+
"dockerImageVersionId": 31011,
23+
"isInternetEnabled": true,
24+
"language": "python",
25+
"sourceType": "notebook",
26+
"isGpuEnabled": true
27+
},
28+
"colab": {
29+
"provenance": [],
30+
"gpuType": "T4"
31+
},
32+
"accelerator": "GPU"
33+
},
34+
"nbformat_minor": 0,
35+
"nbformat": 4,
36+
"cells": [
37+
{
38+
"cell_type": "code",
39+
"source": [
40+
"!nvcc --version\n",
41+
"\n",
42+
"!nvidia-smi"
43+
],
44+
"metadata": {
45+
"trusted": true,
46+
"id": "6V0kOh2GuD3g"
47+
},
48+
"outputs": [],
49+
"execution_count": null
50+
},
51+
{
52+
"cell_type": "code",
53+
"source": [
54+
"!curl -ssL https://magic.modular.com/ | bash"
55+
],
56+
"metadata": {
57+
"trusted": true,
58+
"id": "UYmfAndVuD3h"
59+
},
60+
"outputs": [],
61+
"execution_count": null
62+
},
63+
{
64+
"cell_type": "code",
65+
"source": [
66+
"import os\n",
67+
"os.environ['PATH'] +=':/root/.modular/bin'\n",
68+
"\n"
69+
],
70+
"metadata": {
71+
"trusted": true,
72+
"id": "TUOeZZECuD3i"
73+
},
74+
"outputs": [],
75+
"execution_count": null
76+
},
77+
{
78+
"cell_type": "code",
79+
"source": [
80+
"!magic init gpu_puzzles --format mojoproject"
81+
],
82+
"metadata": {
83+
"trusted": true,
84+
"id": "rDmQeBRVuD3i"
85+
},
86+
"outputs": [],
87+
"execution_count": null
88+
},
89+
{
90+
"cell_type": "code",
91+
"source": [
92+
"%cd gpu_puzzles/"
93+
],
94+
"metadata": {
95+
"trusted": true,
96+
"id": "tNMsV7VwuD3i"
97+
},
98+
"outputs": [],
99+
"execution_count": null
100+
},
101+
{
102+
"cell_type": "code",
103+
"source": [
104+
"%%writefile add_10_2d.mojo\n",
105+
"### Add a constant 10\n",
106+
"### Implement a kernel that adds 10 to each position of 2d matrix a and stores it in out 2d matrix.\n",
107+
"\n",
108+
"\n",
109+
"from gpu.host import DeviceContext\n",
110+
"from memory import UnsafePointer\n",
111+
"from gpu import thread_idx, block_dim\n",
112+
"from testing import assert_equal\n",
113+
"\n",
114+
"alias SIZE = 2\n",
115+
"alias BLOCKS_PER_GRID = 1\n",
116+
"alias THREADS_PER_BLOCK = (3,3)\n",
117+
"alias dtype = DType.float32\n",
118+
"\n",
119+
"\n",
120+
"fn add_10_2d(\n",
121+
" out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]], size: Int\n",
122+
"):\n",
123+
" tid = thread_idx.z * (block_dim.y * block_dim.x) + thread_idx.y * block_dim.x + thread_idx.x\n",
124+
" if tid < size * size:\n",
125+
" out[tid] = array[tid] + 10\n",
126+
"\n",
127+
"\n",
128+
"fn main():\n",
129+
" try:\n",
130+
" ctx = DeviceContext()\n",
131+
" d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n",
132+
" d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n",
133+
" expected = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n",
134+
"\n",
135+
"\n",
136+
" with d_array_buff.map_to_host() as h_array_buff:\n",
137+
" for i in range(SIZE):\n",
138+
" for j in range(SIZE):\n",
139+
" h_array_buff[i * SIZE + j] = i * SIZE + j\n",
140+
" expected[i * SIZE + j] = h_array_buff[i * SIZE + j] + 10\n",
141+
" print(\"Input: \", h_array_buff)\n",
142+
"\n",
143+
" ctx.enqueue_function[add_10_2d](\n",
144+
" d_out_buff.unsafe_ptr(),\n",
145+
" d_array_buff.unsafe_ptr(),\n",
146+
" SIZE,\n",
147+
" grid_dim=BLOCKS_PER_GRID,\n",
148+
" block_dim=THREADS_PER_BLOCK,\n",
149+
" )\n",
150+
"\n",
151+
" ctx.synchronize()\n",
152+
"\n",
153+
" with d_out_buff.map_to_host() as h_out_buff:\n",
154+
" print(h_out_buff)\n",
155+
" print(expected)\n",
156+
" for i in range(SIZE * SIZE ):\n",
157+
" assert_equal(h_out_buff[i], expected[i])\n",
158+
"\n",
159+
" except e:\n",
160+
" print(e)"
161+
],
162+
"metadata": {
163+
"trusted": true,
164+
"execution": {
165+
"iopub.status.busy": "2025-05-14T12:13:26.016637Z",
166+
"iopub.execute_input": "2025-05-14T12:13:26.017309Z",
167+
"iopub.status.idle": "2025-05-14T12:13:26.022915Z",
168+
"shell.execute_reply.started": "2025-05-14T12:13:26.017280Z",
169+
"shell.execute_reply": "2025-05-14T12:13:26.022289Z"
170+
},
171+
"id": "lXVGbVNbuD3j",
172+
"outputId": "6e70e176-460a-42b4-fa78-975bc9f13559"
173+
},
174+
"outputs": [
175+
{
176+
"name": "stdout",
177+
"text": "Overwriting add_10_2d.mojo\n",
178+
"output_type": "stream"
179+
}
180+
],
181+
"execution_count": null
182+
},
183+
{
184+
"cell_type": "code",
185+
"source": [
186+
"!magic run mojo add_10_2d.mojo"
187+
],
188+
"metadata": {
189+
"trusted": true,
190+
"execution": {
191+
"iopub.status.busy": "2025-05-14T12:13:33.613285Z",
192+
"iopub.execute_input": "2025-05-14T12:13:33.613855Z",
193+
"iopub.status.idle": "2025-05-14T12:13:39.393304Z",
194+
"shell.execute_reply.started": "2025-05-14T12:13:33.613833Z",
195+
"shell.execute_reply": "2025-05-14T12:13:39.392434Z"
196+
},
197+
"id": "ruyQcZM7uD3j",
198+
"outputId": "dcbd01c5-d6d4-41c7-85d3-dbf3c6995eac"
199+
},
200+
"outputs": [
201+
{
202+
"name": "stdout",
203+
"text": "\u001b[2K\u001b[32m⠁\u001b[0m activating environment Input: HostBuffer([0.0, 1.0, 2.0, 3.0])\nHostBuffer([10.0, 11.0, 12.0, 13.0])\nHostBuffer([10.0, 11.0, 12.0, 13.0])\n",
204+
"output_type": "stream"
205+
}
206+
],
207+
"execution_count": null
208+
},
209+
{
210+
"cell_type": "code",
211+
"source": [
212+
"!magic run mojo format add_10_2d.mojo"
213+
],
214+
"metadata": {
215+
"trusted": true,
216+
"id": "xYEQBuCeuD3k"
217+
},
218+
"outputs": [],
219+
"execution_count": null
220+
}
221+
]
222+
}

gpu_puzzles/add_10_2d.mojo

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
### Add a constant 10
2+
### Implement a kernel that adds 10 to each element of the 2D matrix `array` and stores the result in the 2D matrix `out`.
3+
4+
5+
from gpu.host import DeviceContext
6+
from memory import UnsafePointer
7+
from gpu import thread_idx, block_dim
8+
from testing import assert_equal
9+
10+
alias SIZE = 2  # side length of the square matrix; every buffer holds SIZE * SIZE elements
11+
alias BLOCKS_PER_GRID = 1  # passed as grid_dim when the kernel is launched
12+
alias THREADS_PER_BLOCK = (3,3)  # passed as block_dim; 3 x 3 = 9 threads for SIZE * SIZE = 4 elements
13+
alias dtype = DType.float32  # element type used for the device and host buffers
14+
15+
16+
fn add_10_2d(
    out: UnsafePointer[Scalar[dtype]],
    array: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    """Add 10 to each element of a `size` x `size` matrix stored as a flat buffer.

    Each thread flattens its (x, y, z) index within the block into a single
    linear id and handles the element at that offset.

    Args:
        out: Pointer to the buffer that receives `array[i] + 10`.
        array: Pointer to the input buffer.
        size: Side length of the matrix; `size * size` elements are processed.
    """
    # Linear id of this thread inside its block: x varies fastest, then y, then z.
    threads_per_row = block_dim.x
    threads_per_plane = block_dim.y * threads_per_row
    flat_id = (
        thread_idx.x
        + thread_idx.y * threads_per_row
        + thread_idx.z * threads_per_plane
    )

    # Threads whose id falls past the last element have nothing to do.
    element_count = size * size
    if flat_id >= element_count:
        return

    out[flat_id] = array[flat_id] + 10
22+
23+
24+
fn main() raises:
    """Run `add_10_2d` on a SIZE x SIZE matrix and verify the result.

    The input buffer is filled with 0, 1, 2, ... in row-major order, the kernel
    is launched with BLOCKS_PER_GRID blocks of THREADS_PER_BLOCK threads, and
    every output element is compared against a host-side `expected` buffer.

    Raises:
        Error: If device setup or the kernel launch fails, or if any output
            element differs from its expected value. Errors are propagated
            rather than caught and printed, so a failed check stops the run
            with an error instead of looking like a successful execution.
    """
    ctx = DeviceContext()
    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
    expected = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)

    # Fill the input through a host mapping and record the expected output
    # for each element at the same time.
    with d_array_buff.map_to_host() as h_array_buff:
        for i in range(SIZE):
            for j in range(SIZE):
                h_array_buff[i * SIZE + j] = i * SIZE + j
                expected[i * SIZE + j] = h_array_buff[i * SIZE + j] + 10
        print("Input: ", h_array_buff)

    ctx.enqueue_function[add_10_2d](
        d_out_buff.unsafe_ptr(),
        d_array_buff.unsafe_ptr(),
        SIZE,
        grid_dim=BLOCKS_PER_GRID,
        block_dim=THREADS_PER_BLOCK,
    )

    # Wait for the enqueued kernel before reading the results back.
    ctx.synchronize()

    with d_out_buff.map_to_host() as h_out_buff:
        print(h_out_buff)
        print(expected)
        # A mismatch raises out of main instead of being swallowed.
        for i in range(SIZE * SIZE):
            assert_equal(h_out_buff[i], expected[i])

0 commit comments

Comments
 (0)