Commit f8a4a13 (1 parent: 8ed8707)

Layout basics
File tree: 2 files changed, +335 -0 lines changed


gpu_puzzles/layout_basics.ipynb

Lines changed: 288 additions & 0 deletions
@@ -0,0 +1,288 @@
{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!nvcc --version\n",
        "\n",
        "!nvidia-smi"
      ],
      "metadata": {
        "id": "sOjZ5dgIdpjd",
        "outputId": "74c722b4-7b10-4726-bd28-3a55489b04d2",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "nvcc: NVIDIA (R) Cuda compiler driver\n",
            "Copyright (c) 2005-2024 NVIDIA Corporation\n",
            "Built on Thu_Jun__6_02:18:23_PDT_2024\n",
            "Cuda compilation tools, release 12.5, V12.5.82\n",
            "Build cuda_12.5.r12.5/compiler.34385749_0\n",
            "Thu May 15 01:02:16 2025 \n",
            "+-----------------------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n",
            "|-----------------------------------------+------------------------+----------------------+\n",
            "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
            "| | | MIG M. |\n",
            "|=========================================+========================+======================|\n",
            "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
            "| N/A 40C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n",
            "| | | N/A |\n",
            "+-----------------------------------------+------------------------+----------------------+\n",
            " \n",
            "+-----------------------------------------------------------------------------------------+\n",
            "| Processes: |\n",
            "| GPU GI CI PID Type Process name GPU Memory |\n",
            "| ID ID Usage |\n",
            "|=========================================================================================|\n",
            "| No running processes found |\n",
            "+-----------------------------------------------------------------------------------------+\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!curl -ssL https://magic.modular.com/ | bash"
      ],
      "metadata": {
        "id": "WzmJ-O8PdtSF",
        "outputId": "d67c2a37-de17-465d-b40c-1c3a0d3ae1e1",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Installing the latest version of Magic...\n",
            " % Total % Received % Xferd Average Speed Time Time Time Current\n",
            " Dload Upload Total Spent Left Speed\n",
            " 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0\n",
            "100 49.9M 100 49.9M 0 0 15.7M 0 0:00:03 0:00:03 --:--:-- 77.5M\n",
            "Done. The 'magic' binary is in '/root/.modular/bin'\n",
            "\n",
            "Two more steps:\n",
            "1. To use 'magic', run this command so it's in your PATH:\n",
            "source /root/.bashrc\n",
            "2. To build with MAX and Mojo, go to http://modul.ar/get-started\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "os.environ['PATH'] +=':/root/.modular/bin'"
      ],
      "metadata": {
        "id": "gYB3L8pcd2Kd"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!magic init gpu_puzzles --format mojoproject"
      ],
      "metadata": {
        "id": "6bLFIXq6d6ak",
        "outputId": "8e310bc0-ea88-4a6b-8fd8-1092cf806188",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[32m✔ \u001b[0mCreated /content/gpu_puzzles/mojoproject.toml\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%cd gpu_puzzles/"
      ],
      "metadata": {
        "id": "uXyz0qdAd-01",
        "outputId": "641db66a-460f-4587-84a7-e3efce379e0d",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/gpu_puzzles\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%writefile layout_basics.mojo\n",
        "from gpu.host import DeviceContext\n",
        "from layout import Layout, LayoutTensor\n",
        "\n",
        "alias HEIGHT = 2\n",
        "alias WIDTH = 3\n",
        "alias dtype = DType.float32\n",
        "alias layout = Layout.row_major(HEIGHT, WIDTH)\n",
        "alias BLOCKS_PER_GRID = 1\n",
        "alias THREADS_PER_BLOCK = 1\n",
        "\n",
        "\n",
        "fn kernel[\n",
        "    dtype: DType, layout: Layout\n",
        "](tensor: LayoutTensor[mut=True, dtype, layout]):\n",
        "    print(\"Before\\n\")\n",
        "    print(tensor)\n",
        "    tensor[0, 0] += 1.0\n",
        "    print()\n",
        "    print(\"After\\n\")\n",
        "    print(tensor)\n",
        "\n",
        "\n",
        "def main():\n",
        "    ctx = DeviceContext(api=\"cuda\")\n",
        "    cpu_ctx = DeviceContext(api=\"cpu\")\n",
        "    buffer = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)\n",
        "    cpu_buffer = cpu_ctx.enqueue_create_host_buffer[dtype](HEIGHT * WIDTH)\n",
        "\n",
        "    for i in range(HEIGHT * WIDTH):\n",
        "        cpu_buffer[i] = i**2\n",
        "\n",
        "    cpu_buffer.enqueue_copy_to(buffer)\n",
        "\n",
        "    tensor = LayoutTensor[mut=True, dtype, layout](buffer.unsafe_ptr())\n",
        "\n",
        "    ctx.enqueue_function[kernel[dtype, layout]](\n",
        "        tensor, grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK\n",
        "    )\n",
        "\n",
        "    ctx.synchronize()\n",
        "\n",
        "    print(ctx.name())\n",
        "    print(ctx.api())\n",
        "    print(cpu_ctx.api())\n",
        "    cpu_buffer.unsafe_ptr()[] = 98.0\n",
        "    print(cpu_buffer)\n"
      ],
      "metadata": {
        "id": "BIjAgNXPeDr0",
        "outputId": "142557b9-746a-4c7e-e8eb-483bbe717d91",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 58,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Overwriting layout_basics.mojo\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!magic run mojo layout_basics.mojo"
      ],
      "metadata": {
        "id": "giCcT7uWeIql",
        "outputId": "b0718f7c-19d4-479d-fd64-88727c5b0232",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 59,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2KBefore\n",
            "\n",
            "0.0 1.0 4.0 \n",
            "9.0 16.0 25.0 \n",
            "\n",
            "After\n",
            "\n",
            "1.0 1.0 4.0 \n",
            "9.0 16.0 25.0 \n",
            "Tesla T4\n",
            "cuda\n",
            "cpu\n",
            "HostBuffer([98.0, 1.0, 4.0, 9.0, 16.0, 25.0])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!magic run mojo format layout_basics.mojo"
      ],
      "metadata": {
        "id": "bdshNEPLeKes",
        "outputId": "75a9c56a-886d-4126-e225-e2cc77abc5dc",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 57,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[1mreformatted layout_basics.mojo\u001b[0m\n",
            "\n",
            "\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
            "\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
          ]
        }
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "Welcome To Colab",
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
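
A quick note on the run output above: Layout.row_major(HEIGHT, WIDTH) places element (r, c) at linear offset r * WIDTH + c, which is why the host values 0, 1, 4, 9, 16, 25 print as the rows 0.0 1.0 4.0 and 9.0 16.0 25.0, and why the kernel's tensor[0, 0] += 1.0 only bumps the first printed value. The following stand-alone sketch of that offset arithmetic is illustrative only (plain Mojo, no GPU calls, assuming the usual row-major convention):

alias HEIGHT = 2
alias WIDTH = 3


def main():
    # Row-major: element (r, c) sits at linear offset r * WIDTH + c.
    for r in range(HEIGHT):
        for c in range(WIDTH):
            offset = r * WIDTH + c
            # The host loop in the notebook fills offset i with i**2,
            # so this reproduces the "Before" grid: 0 1 4 / 9 16 25.
            print("(", r, ",", c, ") -> offset", offset, "value", offset * offset)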

gpu_puzzles/layout_basics.mojo

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor

alias HEIGHT = 2
alias WIDTH = 3
alias dtype = DType.float32
alias layout = Layout.row_major(HEIGHT, WIDTH)
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = 1


fn kernel[
    dtype: DType, layout: Layout
](tensor: LayoutTensor[mut=True, dtype, layout]):
    print("Before\n")
    print(tensor)
    tensor[0, 0] += 1.0
    print()
    print("After\n")
    print(tensor)


def main():
    ctx = DeviceContext(api="cuda")
    cpu_ctx = DeviceContext(api="cpu")
    buffer = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)
    cpu_buffer = cpu_ctx.enqueue_create_host_buffer[dtype](HEIGHT * WIDTH)

    for i in range(HEIGHT * WIDTH):
        cpu_buffer[i] = i**2

    cpu_buffer.enqueue_copy_to(buffer)

    tensor = LayoutTensor[mut=True, dtype, layout](buffer.unsafe_ptr())

    ctx.enqueue_function[kernel[dtype, layout]](
        tensor, grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK
    )

    ctx.synchronize()

    print(ctx.name())
    print(ctx.api())
    print(cpu_ctx.api())
    cpu_buffer.unsafe_ptr()[] = 98.0
    print(cpu_buffer)
