Commit f8a4a13 (1 parent: 8ed8707)

Layout basics
File tree: 2 files changed, +335 -0 lines changed


gpu_puzzles/layout_basics.ipynb

Lines changed: 288 additions & 0 deletions
@@ -0,0 +1,288 @@
{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!nvcc --version\n",
        "\n",
        "!nvidia-smi"
      ],
      "metadata": {
        "id": "sOjZ5dgIdpjd",
        "outputId": "74c722b4-7b10-4726-bd28-3a55489b04d2",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "nvcc: NVIDIA (R) Cuda compiler driver\n",
            "Copyright (c) 2005-2024 NVIDIA Corporation\n",
            "Built on Thu_Jun__6_02:18:23_PDT_2024\n",
            "Cuda compilation tools, release 12.5, V12.5.82\n",
            "Build cuda_12.5.r12.5/compiler.34385749_0\n",
            "Thu May 15 01:02:16 2025 \n",
            "+-----------------------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n",
            "|-----------------------------------------+------------------------+----------------------+\n",
            "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
            "| | | MIG M. |\n",
            "|=========================================+========================+======================|\n",
            "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
            "| N/A 40C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n",
            "| | | N/A |\n",
            "+-----------------------------------------+------------------------+----------------------+\n",
            " \n",
            "+-----------------------------------------------------------------------------------------+\n",
            "| Processes: |\n",
            "| GPU GI CI PID Type Process name GPU Memory |\n",
            "| ID ID Usage |\n",
            "|=========================================================================================|\n",
            "| No running processes found |\n",
            "+-----------------------------------------------------------------------------------------+\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!curl -ssL https://magic.modular.com/ | bash"
      ],
      "metadata": {
        "id": "WzmJ-O8PdtSF",
        "outputId": "d67c2a37-de17-465d-b40c-1c3a0d3ae1e1",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Installing the latest version of Magic...\n",
            " % Total % Received % Xferd Average Speed Time Time Time Current\n",
            " Dload Upload Total Spent Left Speed\n",
            " 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0\n",
            "100 49.9M 100 49.9M 0 0 15.7M 0 0:00:03 0:00:03 --:--:-- 77.5M\n",
            "Done. The 'magic' binary is in '/root/.modular/bin'\n",
            "\n",
            "Two more steps:\n",
            "1. To use 'magic', run this command so it's in your PATH:\n",
            "source /root/.bashrc\n",
            "2. To build with MAX and Mojo, go to http://modul.ar/get-started\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "os.environ['PATH'] +=':/root/.modular/bin'"
      ],
      "metadata": {
        "id": "gYB3L8pcd2Kd"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!magic init gpu_puzzles --format mojoproject"
      ],
      "metadata": {
        "id": "6bLFIXq6d6ak",
        "outputId": "8e310bc0-ea88-4a6b-8fd8-1092cf806188",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[32m✔ \u001b[0mCreated /content/gpu_puzzles/mojoproject.toml\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%cd gpu_puzzles/"
      ],
      "metadata": {
        "id": "uXyz0qdAd-01",
        "outputId": "641db66a-460f-4587-84a7-e3efce379e0d",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/gpu_puzzles\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%writefile layout_basics.mojo\n",
        "from gpu.host import DeviceContext\n",
        "from layout import Layout, LayoutTensor\n",
        "\n",
        "alias HEIGHT = 2\n",
        "alias WIDTH = 3\n",
        "alias dtype = DType.float32\n",
        "alias layout = Layout.row_major(HEIGHT, WIDTH)\n",
        "alias BLOCKS_PER_GRID = 1\n",
        "alias THREADS_PER_BLOCK = 1\n",
        "\n",
        "\n",
        "fn kernel[\n",
        "    dtype: DType, layout: Layout\n",
        "](tensor: LayoutTensor[mut=True, dtype, layout]):\n",
        "    print(\"Before\\n\")\n",
        "    print(tensor)\n",
        "    tensor[0, 0] += 1.0\n",
        "    print()\n",
        "    print(\"After\\n\")\n",
        "    print(tensor)\n",
        "\n",
        "\n",
        "def main():\n",
        "    ctx = DeviceContext(api=\"cuda\")\n",
        "    cpu_ctx = DeviceContext(api=\"cpu\")\n",
        "    buffer = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)\n",
        "    cpu_buffer = cpu_ctx.enqueue_create_host_buffer[dtype](HEIGHT * WIDTH)\n",
        "\n",
        "    for i in range(HEIGHT * WIDTH):\n",
        "        cpu_buffer[i] = i**2\n",
        "\n",
        "    cpu_buffer.enqueue_copy_to(buffer)\n",
        "\n",
        "    tensor = LayoutTensor[mut=True, dtype, layout](buffer.unsafe_ptr())\n",
        "\n",
        "    ctx.enqueue_function[kernel[dtype, layout]](\n",
        "        tensor, grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK\n",
        "    )\n",
        "\n",
        "    ctx.synchronize()\n",
        "\n",
        "    print(ctx.name())\n",
        "    print(ctx.api())\n",
        "    print(cpu_ctx.api())\n",
        "    cpu_buffer.unsafe_ptr()[] = 98.0\n",
        "    print(cpu_buffer)\n"
      ],
      "metadata": {
        "id": "BIjAgNXPeDr0",
        "outputId": "142557b9-746a-4c7e-e8eb-483bbe717d91",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 58,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Overwriting layout_basics.mojo\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!magic run mojo layout_basics.mojo"
      ],
      "metadata": {
        "id": "giCcT7uWeIql",
        "outputId": "b0718f7c-19d4-479d-fd64-88727c5b0232",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 59,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2KBefore\n",
            "\n",
            "0.0 1.0 4.0 \n",
            "9.0 16.0 25.0 \n",
            "\n",
            "After\n",
            "\n",
            "1.0 1.0 4.0 \n",
            "9.0 16.0 25.0 \n",
            "Tesla T4\n",
            "cuda\n",
            "cpu\n",
            "HostBuffer([98.0, 1.0, 4.0, 9.0, 16.0, 25.0])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!magic run mojo format layout_basics.mojo"
      ],
      "metadata": {
        "id": "bdshNEPLeKes",
        "outputId": "75a9c56a-886d-4126-e225-e2cc77abc5dc",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 57,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[1mreformatted layout_basics.mojo\u001b[0m\n",
            "\n",
            "\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
            "\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
          ]
        }
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "Welcome To Colab",
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
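
A quick note on the run output above: Layout.row_major(HEIGHT, WIDTH) places element (r, c) at linear offset r * WIDTH + c, which is why the host values 0, 1, 4, 9, 16, 25 print as the rows 0.0 1.0 4.0 and 9.0 16.0 25.0, and why the kernel's tensor[0, 0] += 1.0 only bumps the first printed value. The following stand-alone sketch of that offset arithmetic is illustrative only (plain Mojo, no GPU calls, assuming the usual row-major convention):

alias HEIGHT = 2
alias WIDTH = 3


def main():
    # Row-major: element (r, c) sits at linear offset r * WIDTH + c.
    for r in range(HEIGHT):
        for c in range(WIDTH):
            offset = r * WIDTH + c
            # The host loop in the notebook fills offset i with i**2,
            # so this reproduces the "Before" grid: 0 1 4 / 9 16 25.
            print("(", r, ",", c, ") -> offset", offset, "value", offset * offset)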

gpu_puzzles/layout_basics.mojo

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor

alias HEIGHT = 2
alias WIDTH = 3
alias dtype = DType.float32
alias layout = Layout.row_major(HEIGHT, WIDTH)
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = 1


fn kernel[
    dtype: DType, layout: Layout
](tensor: LayoutTensor[mut=True, dtype, layout]):
    print("Before\n")
    print(tensor)
    tensor[0, 0] += 1.0
    print()
    print("After\n")
    print(tensor)


def main():
    ctx = DeviceContext(api="cuda")
    cpu_ctx = DeviceContext(api="cpu")
    buffer = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)
    cpu_buffer = cpu_ctx.enqueue_create_host_buffer[dtype](HEIGHT * WIDTH)

    for i in range(HEIGHT * WIDTH):
        cpu_buffer[i] = i**2

    cpu_buffer.enqueue_copy_to(buffer)

    tensor = LayoutTensor[mut=True, dtype, layout](buffer.unsafe_ptr())

    ctx.enqueue_function[kernel[dtype, layout]](
        tensor, grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK
    )

    ctx.synchronize()

    print(ctx.name())
    print(ctx.api())
    print(cpu_ctx.api())
    cpu_buffer.unsafe_ptr()[] = 98.0
    print(cpu_buffer)
