Skip to content

Commit 7c13bcd

Browse files
committed
Boundary check
1 parent 3deae99 commit 7c13bcd

File tree

2 files changed

+253
-0
lines changed

2 files changed

+253
-0
lines changed
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"source": [
6+
"!curl -ssL https://magic.modular.com/ | bash"
7+
],
8+
"metadata": {
9+
"id": "A8X6phvz7ZoQ"
10+
},
11+
"execution_count": null,
12+
"outputs": []
13+
},
14+
{
15+
"cell_type": "code",
16+
"source": [
17+
"import os\n",
18+
"os.environ['PATH'] += ':/root/.modular/bin'"
19+
],
20+
"metadata": {
21+
"id": "n7zS_6gK7fnB"
22+
},
23+
"execution_count": null,
24+
"outputs": []
25+
},
26+
{
27+
"cell_type": "code",
28+
"source": [
29+
"!magic init gpu_puzzles --format mojoproject"
30+
],
31+
"metadata": {
32+
"id": "Zlg5BNMn7j64"
33+
},
34+
"execution_count": null,
35+
"outputs": []
36+
},
37+
{
38+
"cell_type": "code",
39+
"source": [
40+
"%cd gpu_puzzles/"
41+
],
42+
"metadata": {
43+
"id": "mO77-mj17lsA"
44+
},
45+
"execution_count": null,
46+
"outputs": []
47+
},
48+
{
49+
"cell_type": "code",
50+
"source": [
51+
"%%writefile add_10_with_guard.mojo\n",
52+
"\n",
53+
"### Add 10\n",
54+
"### Implement a kernel that adds 10 to each position of vector a and stores it in vector out.\n",
55+
"### More threads than data — guard against out-of-bounds access.\n",
56+
"\n",
57+
"from gpu.host import DeviceContext\n",
58+
"from memory import UnsafePointer\n",
59+
"from gpu import thread_idx, block_dim, block_idx\n",
60+
"from testing import assert_equal\n",
61+
"\n",
62+
"alias SIZE = 4\n",
63+
"alias BLOCKS_PER_GRID = 1\n",
64+
"alias THREADS_PER_BLOCK = (8, 1)\n",
65+
"alias dtype = DType.float32\n",
66+
"\n",
67+
"\n",
68+
"fn add_10_with_guard(\n",
69+
" out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]\n",
70+
"):\n",
71+
" tid = (\n",
72+
" thread_idx.z * (block_dim.y * block_dim.x)\n",
73+
" + thread_idx.y * block_dim.x\n",
74+
" + thread_idx.x\n",
75+
" )\n",
76+
"\n",
77+
" if tid < SIZE:\n",
78+
" out[tid] = array[tid] + 10\n",
79+
"\n",
80+
"\n",
81+
"fn main() raises:\n",
82+
" ctx = DeviceContext()\n",
83+
" d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
84+
" d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
85+
" expected = ctx.enqueue_create_host_buffer[dtype](SIZE)\n",
86+
" _ = d_out_buff.enqueue_fill(0)\n",
87+
"\n",
88+
" with d_array_buff.map_to_host() as h_array_buff:\n",
89+
" for i in range(SIZE):\n",
90+
" h_array_buff[i] = i\n",
91+
"\n",
92+
" ctx.enqueue_function[add_10_with_guard](\n",
93+
" d_out_buff.unsafe_ptr(),\n",
94+
" d_array_buff.unsafe_ptr(),\n",
95+
" grid_dim=BLOCKS_PER_GRID,\n",
96+
" block_dim=THREADS_PER_BLOCK,\n",
97+
" )\n",
98+
"\n",
99+
" ctx.synchronize()\n",
100+
"\n",
101+
" for i in range(SIZE):\n",
102+
" expected[i] = i + 10\n",
103+
"\n",
104+
" print(expected)\n",
105+
"\n",
106+
" with d_out_buff.map_to_host() as h_out_buff:\n",
107+
" print(h_out_buff)\n",
108+
" for i in range(SIZE):\n",
109+
" assert_equal(h_out_buff[i], expected[i])\n"
110+
],
111+
"metadata": {
112+
"id": "UT3V1O2M7txw",
113+
"colab": {
114+
"base_uri": "https://localhost:8080/"
115+
},
116+
"outputId": "dd7e127f-d029-4137-af6a-956caee52453"
117+
},
118+
"execution_count": 55,
119+
"outputs": [
120+
{
121+
"output_type": "stream",
122+
"name": "stdout",
123+
"text": [
124+
"Overwriting add_10_with_guard.mojo\n"
125+
]
126+
}
127+
]
128+
},
129+
{
130+
"cell_type": "code",
131+
"source": [
132+
"!magic run mojo add_10_with_guard.mojo"
133+
],
134+
"metadata": {
135+
"id": "CkjRGISm7y1Q",
136+
"outputId": "1b98779b-fac1-45c9-8e4a-8bce608410e3",
137+
"colab": {
138+
"base_uri": "https://localhost:8080/"
139+
}
140+
},
141+
"execution_count": 56,
142+
"outputs": [
143+
{
144+
"output_type": "stream",
145+
"name": "stdout",
146+
"text": [
147+
"\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2KHostBuffer([10.0, 11.0, 12.0, 13.0])\n",
148+
"HostBuffer([10.0, 11.0, 12.0, 13.0])\n"
149+
]
150+
}
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"source": [
156+
"!magic run mojo format add_10_with_guard.mojo"
157+
],
158+
"metadata": {
159+
"colab": {
160+
"base_uri": "https://localhost:8080/"
161+
},
162+
"id": "Cc2XVTrevpy5",
163+
"outputId": "17653df8-73db-4a76-c642-bd497ee58cf4"
164+
},
165+
"execution_count": 52,
166+
"outputs": [
167+
{
168+
"output_type": "stream",
169+
"name": "stdout",
170+
"text": [
171+
"\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[1mreformatted add_10_with_guard.mojo\u001b[0m\n",
172+
"\n",
173+
"\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
174+
"\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
175+
]
176+
}
177+
]
178+
}
179+
],
180+
"metadata": {
181+
"colab": {
182+
"name": "Welcome To Colab",
183+
"provenance": [],
184+
"gpuType": "T4"
185+
},
186+
"kernelspec": {
187+
"display_name": "Python 3",
188+
"name": "python3"
189+
},
190+
"accelerator": "GPU"
191+
},
192+
"nbformat": 4,
193+
"nbformat_minor": 0
194+
}

gpu_puzzles/add_10_with_guard.mojo

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
### Add 10
3+
### Implement a kernel that adds 10 to each element of the input vector `array` and stores the result in the vector `out`.
4+
### More threads than data — guard against out-of-bounds access.
5+
6+
from gpu.host import DeviceContext
7+
from memory import UnsafePointer
8+
from gpu import thread_idx, block_dim, block_idx
9+
from testing import assert_equal
10+
11+
alias SIZE = 4
12+
alias BLOCKS_PER_GRID = 1
13+
alias THREADS_PER_BLOCK = (8, 1)
14+
alias dtype = DType.float32
15+
16+
17+
fn add_10_with_guard(
    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]
):
    """Write `array[i] + 10` into `out[i]` for every index `i < SIZE`.

    Each thread handles at most one element. Threads whose index is `SIZE`
    or larger do nothing, so launching more threads than elements is safe.

    Args:
        out: Destination pointer; elements `0 .. SIZE - 1` are written.
        array: Source pointer; elements `0 .. SIZE - 1` are read.
    """
    # Flatten the (x, y, z) position of this thread inside its block.
    threads_per_block = block_dim.x * block_dim.y * block_dim.z
    local_tid = (
        thread_idx.z * (block_dim.y * block_dim.x)
        + thread_idx.y * block_dim.x
        + thread_idx.x
    )

    # Offset by the block position so that threads in different blocks get
    # distinct indices. The original used the block-local index only, which
    # made every block alias the same elements. With a single block
    # (block_idx.x == 0) this is identical to the block-local index.
    # NOTE(review): only the x component of the grid is taken into account;
    # a 2-D/3-D grid would additionally need the grid dimensions.
    tid = block_idx.x * threads_per_block + local_tid

    # Boundary guard: surplus threads must not touch memory past SIZE.
    if tid < SIZE:
        out[tid] = array[tid] + 10
29+
30+
fn main() raises:
    """Launch `add_10_with_guard` once and check its output on the host.

    The input is filled with 0, 1, ..., SIZE - 1. The kernel is launched with
    BLOCKS_PER_GRID blocks of THREADS_PER_BLOCK threads, and the device output
    is compared element by element against `i + 10`.

    Raises:
        If any output element differs from its expected value
        (reported by `assert_equal`).
    """
    device_ctx = DeviceContext()

    # Device-side input and output, plus a host-side buffer that will hold
    # the reference values.
    input_dev = device_ctx.enqueue_create_buffer[dtype](SIZE)
    output_dev = device_ctx.enqueue_create_buffer[dtype](SIZE)
    reference = device_ctx.enqueue_create_host_buffer[dtype](SIZE)
    _ = output_dev.enqueue_fill(0)

    # Populate the input through a host mapping of the device buffer.
    with input_dev.map_to_host() as input_host:
        for idx in range(SIZE):
            input_host[idx] = idx

    device_ctx.enqueue_function[add_10_with_guard](
        output_dev.unsafe_ptr(),
        input_dev.unsafe_ptr(),
        grid_dim=BLOCKS_PER_GRID,
        block_dim=THREADS_PER_BLOCK,
    )
    device_ctx.synchronize()

    # The reference values are written only after the synchronize call,
    # in the same order as the original code.
    for idx in range(SIZE):
        reference[idx] = idx + 10
    print(reference)

    with output_dev.map_to_host() as output_host:
        print(output_host)
        for idx in range(SIZE):
            assert_equal(output_host[idx], reference[idx])

0 commit comments

Comments
 (0)