Boundary check

ratulb · ratulb · commit 7c13bcd93dcc · 2025-05-14T10:01:42.000Z
diff --git a/gpu_puzzles/add_10_with_guard.ipynb b/gpu_puzzles/add_10_with_guard.ipynb
@@ -0,0 +1,194 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!curl -ssL https://magic.modular.com/ | bash"
+      ],
+      "metadata": {
+        "id": "A8X6phvz7ZoQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.environ['PATH'] += ':/root/.modular/bin'"
+      ],
+      "metadata": {
+        "id": "n7zS_6gK7fnB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic init gpu_puzzles --format mojoproject"
+      ],
+      "metadata": {
+        "id": "Zlg5BNMn7j64"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd gpu_puzzles/"
+      ],
+      "metadata": {
+        "id": "mO77-mj17lsA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile add_10_with_guard.mojo\n",
+        "\n",
+        "### Add 10\n",
+        "### Implement a kernel that adds 10 to each element of vector `array` and stores the result in vector `out`.\n",
+        "### More threads than data — guard against out-of-bounds access.\n",
+        "\n",
+        "from gpu.host import DeviceContext\n",
+        "from memory import UnsafePointer\n",
+        "from gpu import thread_idx, block_dim, block_idx\n",
+        "from testing import assert_equal\n",
+        "\n",
+        "alias SIZE = 4\n",
+        "alias BLOCKS_PER_GRID = 1\n",
+        "alias THREADS_PER_BLOCK = (8, 1)\n",
+        "alias dtype = DType.float32\n",
+        "\n",
+        "\n",
+        "fn add_10_with_guard(\n",
+        "    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]\n",
+        "):\n",
+        "    tid = (\n",
+        "        thread_idx.z * (block_dim.y * block_dim.x)\n",
+        "        + thread_idx.y * block_dim.x\n",
+        "        + thread_idx.x\n",
+        "    )\n",
+        "\n",
+        "    if tid < SIZE:\n",
+        "        out[tid] = array[tid] + 10\n",
+        "\n",
+        "\n",
+        "fn main() raises:\n",
+        "    ctx = DeviceContext()\n",
+        "    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    expected = ctx.enqueue_create_host_buffer[dtype](SIZE)\n",
+        "    _ = d_out_buff.enqueue_fill(0)\n",
+        "\n",
+        "    with d_array_buff.map_to_host() as h_array_buff:\n",
+        "        for i in range(SIZE):\n",
+        "            h_array_buff[i] = i\n",
+        "\n",
+        "    ctx.enqueue_function[add_10_with_guard](\n",
+        "        d_out_buff.unsafe_ptr(),\n",
+        "        d_array_buff.unsafe_ptr(),\n",
+        "        grid_dim=BLOCKS_PER_GRID,\n",
+        "        block_dim=THREADS_PER_BLOCK,\n",
+        "    )\n",
+        "\n",
+        "    ctx.synchronize()\n",
+        "\n",
+        "    for i in range(SIZE):\n",
+        "        expected[i] = i + 10\n",
+        "\n",
+        "    print(expected)\n",
+        "\n",
+        "    with d_out_buff.map_to_host() as h_out_buff:\n",
+        "        print(h_out_buff)\n",
+        "        for i in range(SIZE):\n",
+        "            assert_equal(h_out_buff[i], expected[i])\n"
+      ],
+      "metadata": {
+        "id": "UT3V1O2M7txw",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "dd7e127f-d029-4137-af6a-956caee52453"
+      },
+      "execution_count": 55,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Overwriting add_10_with_guard.mojo\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo add_10_with_guard.mojo"
+      ],
+      "metadata": {
+        "id": "CkjRGISm7y1Q",
+        "outputId": "1b98779b-fac1-45c9-8e4a-8bce608410e3",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "execution_count": 56,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2KHostBuffer([10.0, 11.0, 12.0, 13.0])\n",
+            "HostBuffer([10.0, 11.0, 12.0, 13.0])\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo format add_10_with_guard.mojo"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Cc2XVTrevpy5",
+        "outputId": "17653df8-73db-4a76-c642-bd497ee58cf4"
+      },
+      "execution_count": 52,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[1mreformatted add_10_with_guard.mojo\u001b[0m\n",
+            "\n",
+            "\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
+            "\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
+          ]
+        }
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "name": "Welcome To Colab",
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "accelerator": "GPU"
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/gpu_puzzles/add_10_with_guard.mojo b/gpu_puzzles/add_10_with_guard.mojo
@@ -0,0 +1,59 @@
+
+### Add 10
+### Implement a kernel that adds 10 to each element of vector `array` and stores the result in vector `out`.
+### More threads than data — guard against out-of-bounds access.
+
+from gpu.host import DeviceContext
+from memory import UnsafePointer
+from gpu import thread_idx, block_dim, block_idx
+from testing import assert_equal
+
+alias SIZE = 4  # element count of every buffer; also the kernel's guard bound
+alias BLOCKS_PER_GRID = 1  # passed as grid_dim when the kernel is launched
+alias THREADS_PER_BLOCK = (8, 1)  # passed as block_dim; exceeds SIZE on purpose
+alias dtype = DType.float32  # element type of the buffers and kernel pointers
+
+
+fn add_10_with_guard(
+    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]
+):
+    """Store array[i] + 10 in out[i] for each thread whose flat index i < SIZE."""
+    # Flatten the 3-D in-block thread index: x varies fastest, then y, then z.
+    row_width = block_dim.x
+    plane_size = block_dim.y * row_width
+    flat = thread_idx.x + thread_idx.y * row_width + thread_idx.z * plane_size
+    if flat >= SIZE:
+        return  # index is past the data: read and write nothing
+    out[flat] = array[flat] + 10
+
+
+fn main() raises:
+    """Launch the kernel on 0..SIZE-1 and assert every result is input + 10."""
+    gpu = DeviceContext()
+    input_dev = gpu.enqueue_create_buffer[dtype](SIZE)
+    output_dev = gpu.enqueue_create_buffer[dtype](SIZE)
+    want = gpu.enqueue_create_host_buffer[dtype](SIZE)
+    _ = output_dev.enqueue_fill(0)
+
+    with input_dev.map_to_host() as input_host:
+        for idx in range(SIZE):
+            input_host[idx] = idx
+
+    gpu.enqueue_function[add_10_with_guard](
+        output_dev.unsafe_ptr(),
+        input_dev.unsafe_ptr(),
+        grid_dim=BLOCKS_PER_GRID,
+        block_dim=THREADS_PER_BLOCK,
+    )
+    # Block until the work enqueued above is done before using host-side data.
+    gpu.synchronize()
+
+    for idx in range(SIZE):
+        want[idx] = idx + 10
+    print(want)
+
+    with output_dev.map_to_host() as output_host:
+        print(output_host)
+        for idx in range(SIZE):
+            assert_equal(output_host[idx], want[idx])
+