Skip to content

Commit 57cc364

Browse files
committed
Add 10 to 2D Layout tensor
1 parent 9f274bc commit 57cc364

File tree

2 files changed

+275
-0
lines changed

2 files changed

+275
-0
lines changed

gpu_puzzles/add_10_2dlayout.ipynb

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"id": "buOgxm25ONit"
8+
},
9+
"outputs": [],
10+
"source": [
11+
"!curl -ssL https://magic.modular.com/ | bash"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"metadata": {
18+
"id": "FVZvyhRiONiw"
19+
},
20+
"outputs": [],
21+
"source": [
22+
"import os\n",
23+
"os.environ['PATH'] +=':/root/.modular/bin'"
24+
]
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": null,
29+
"metadata": {
30+
"id": "TqFD0EK0ONiw"
31+
},
32+
"outputs": [],
33+
"source": [
34+
"!magic init gpu_puzzles --format mojoproject"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"metadata": {
41+
"id": "k3Ddb6GcONiw"
42+
},
43+
"outputs": [],
44+
"source": [
45+
"%cd gpu_puzzles/"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": 16,
51+
"metadata": {
52+
"colab": {
53+
"base_uri": "https://localhost:8080/"
54+
},
55+
"id": "IaxB1auxONix",
56+
"outputId": "3474befe-bbb9-459a-ddd5-e9855ca305b5"
57+
},
58+
"outputs": [
59+
{
60+
"output_type": "stream",
61+
"name": "stdout",
62+
"text": [
63+
"Overwriting add_10_2dlayout.mojo\n"
64+
]
65+
}
66+
],
67+
"source": [
68+
"%%writefile add_10_2dlayout.mojo\n",
69+
"\n",
70+
"### Add constant to 2D Layout tensor\n",
71+
"### Implement a kernel that adds 10 to each position of 2D LayoutTensor a and stores it in 2D LayoutTensor out.\n",
72+
"\n",
73+
"from gpu.host import DeviceContext\n",
74+
"from gpu import thread_idx\n",
75+
"from layout import Layout, LayoutTensor\n",
76+
"from math import iota\n",
77+
"\n",
78+
"\n",
79+
"alias SIZE = 2\n",
80+
"alias BLOCKS_PER_GRID = 1\n",
81+
"alias THREADS_PER_BLOCK = (3, 3)\n",
82+
"alias dtype = DType.float32\n",
83+
"alias layout = Layout.row_major(SIZE, SIZE)\n",
84+
"\n",
85+
"\n",
86+
"fn add_10_2dlayout(\n",
87+
" out: LayoutTensor[mut=True, dtype, layout],\n",
88+
" a: LayoutTensor[mut=True, dtype, layout],\n",
89+
" size: Int,\n",
90+
"):\n",
91+
" row = thread_idx.y\n",
92+
" col = thread_idx.x\n",
93+
" # FILL ME IN (roughly 2 lines)\n",
94+
" if row < size and col < size:\n",
95+
" out[row, col] = a[row, col] + 10\n",
96+
"\n",
97+
"\n",
98+
"fn main():\n",
99+
" try:\n",
100+
" ctx = DeviceContext()\n",
101+
"\n",
102+
" buffer_a = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(\n",
103+
" 0.0\n",
104+
" )\n",
105+
" buffer_out = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(\n",
106+
" 0.0\n",
107+
" )\n",
108+
"\n",
109+
" with buffer_a.map_to_host() as h_buffer_a:\n",
110+
" iota(h_buffer_a.unsafe_ptr(), SIZE * SIZE)\n",
111+
"\n",
112+
" out = LayoutTensor[mut=True, dtype, layout](buffer_out)\n",
113+
" a = LayoutTensor[mut=True, dtype, layout](buffer_a)\n",
114+
"\n",
115+
" ctx.enqueue_function[add_10_2dlayout](\n",
116+
" out,\n",
117+
" a,\n",
118+
" SIZE,\n",
119+
" grid_dim=(BLOCKS_PER_GRID, BLOCKS_PER_GRID),\n",
120+
" block_dim=THREADS_PER_BLOCK,\n",
121+
" )\n",
122+
"\n",
123+
" ctx.synchronize()\n",
124+
"\n",
125+
" with buffer_out.map_to_host() as h_buffer_out:\n",
126+
" print(h_buffer_out)\n",
127+
" except e:\n",
128+
" print(e)\n"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 17,
134+
"metadata": {
135+
"colab": {
136+
"base_uri": "https://localhost:8080/"
137+
},
138+
"id": "h2k9wkDaONiz",
139+
"outputId": "3cb08451-43dd-4400-bab3-fcb435191c05"
140+
},
141+
"outputs": [
142+
{
143+
"output_type": "stream",
144+
"name": "stdout",
145+
"text": [
146+
"\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2KHostBuffer([10.0, 11.0, 12.0, 13.0])\n"
147+
]
148+
}
149+
],
150+
"source": [
151+
"!magic run mojo add_10_2dlayout.mojo"
152+
]
153+
},
154+
{
155+
"cell_type": "code",
156+
"execution_count": 15,
157+
"metadata": {
158+
"colab": {
159+
"base_uri": "https://localhost:8080/"
160+
},
161+
"id": "bSglX7bNONi0",
162+
"outputId": "4f7126e7-85f3-4ef3-fed9-79673b282bc4"
163+
},
164+
"outputs": [
165+
{
166+
"output_type": "stream",
167+
"name": "stdout",
168+
"text": [
169+
"\u001b[32m⠁\u001b[0m \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment \r\u001b[2K\u001b[1mreformatted add_10_2dlayout.mojo\u001b[0m\n",
170+
"\n",
171+
"\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
172+
"\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
173+
]
174+
}
175+
],
176+
"source": [
177+
"!magic run mojo format add_10_2dlayout.mojo"
178+
]
179+
}
180+
],
181+
"metadata": {
182+
"accelerator": "GPU",
183+
"colab": {
184+
"gpuType": "T4",
185+
"provenance": []
186+
},
187+
"kaggle": {
188+
"accelerator": "nvidiaTeslaT4",
189+
"dataSources": [],
190+
"dockerImageVersionId": 31041,
191+
"isGpuEnabled": true,
192+
"isInternetEnabled": true,
193+
"language": "python",
194+
"sourceType": "notebook"
195+
},
196+
"kernelspec": {
197+
"display_name": "Python 3",
198+
"name": "python3"
199+
},
200+
"language_info": {
201+
"codemirror_mode": {
202+
"name": "ipython",
203+
"version": 3
204+
},
205+
"file_extension": ".py",
206+
"mimetype": "text/x-python",
207+
"name": "python",
208+
"nbconvert_exporter": "python",
209+
"pygments_lexer": "ipython3",
210+
"version": "3.11.11"
211+
}
212+
},
213+
"nbformat": 4,
214+
"nbformat_minor": 0
215+
}

gpu_puzzles/add_10_2dlayout.mojo

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
### Add constant to 2D Layout tensor
### Implement a kernel that adds 10 to each position of 2D LayoutTensor a and stores it in 2D LayoutTensor out.

from gpu import thread_idx
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor
from math import iota


# Problem configuration: a SIZE x SIZE row-major tensor, launched from a
# single block that is deliberately larger (3 x 3 threads) than the 2 x 2
# data, so the kernel's bounds guard is exercised.
alias SIZE = 2
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)
alias dtype = DType.float32
alias layout = Layout.row_major(SIZE, SIZE)
fn add_10_2dlayout(
    out: LayoutTensor[mut=True, dtype, layout],
    a: LayoutTensor[mut=True, dtype, layout],
    size: Int,
):
    """Add 10 to every element of `a`, writing the result into `out`.

    Each thread handles at most one element: thread (x, y) maps to
    column x / row y. Threads outside the size x size extent (the launch
    uses a 3 x 3 block for a 2 x 2 tensor) do nothing.
    """
    # NOTE(review): `a` is only read here; mut=True is presumably kept to
    # match the puzzle's given signature — confirm before tightening.
    col = thread_idx.x
    row = thread_idx.y
    if col < size and row < size:
        out[row, col] = a[row, col] + 10


fn main():
    # Host driver: fill the input buffer with 0..SIZE*SIZE-1, launch the
    # kernel over a single (3, 3) block, then print the device result.
    try:
        device = DeviceContext()

        # Zero-initialised device buffers for input and output.
        in_buf = device.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(
            0.0
        )
        out_buf = device.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(
            0.0
        )

        # Write 0, 1, 2, ... into the input from the host side.
        with in_buf.map_to_host() as host_in:
            iota(host_in.unsafe_ptr(), SIZE * SIZE)

        # 2D row-major views over the flat device buffers.
        a = LayoutTensor[mut=True, dtype, layout](in_buf)
        out = LayoutTensor[mut=True, dtype, layout](out_buf)

        device.enqueue_function[add_10_2dlayout](
            out,
            a,
            SIZE,
            grid_dim=(BLOCKS_PER_GRID, BLOCKS_PER_GRID),
            block_dim=THREADS_PER_BLOCK,
        )

        device.synchronize()

        # Expect HostBuffer([10.0, 11.0, 12.0, 13.0]).
        with out_buf.map_to_host() as host_out:
            print(host_out)
    except e:
        print(e)

0 commit comments

Comments
 (0)