Add constant to 2nd tensor

ratulb · ratulb · commit 8ed87079e967 · 2025-05-14T12:23:49.000Z
diff --git a/gpu_puzzles/add_10_2d.ipynb b/gpu_puzzles/add_10_2d.ipynb
@@ -0,0 +1,222 @@
+{
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.11",
+      "mimetype": "text/x-python",
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "pygments_lexer": "ipython3",
+      "nbconvert_exporter": "python",
+      "file_extension": ".py"
+    },
+    "kaggle": {
+      "accelerator": "nvidiaTeslaT4",
+      "dataSources": [],
+      "dockerImageVersionId": 31011,
+      "isInternetEnabled": true,
+      "language": "python",
+      "sourceType": "notebook",
+      "isGpuEnabled": true
+    },
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "accelerator": "GPU"
+  },
+  "nbformat_minor": 0,
+  "nbformat": 4,
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!nvcc --version\n",
+        "\n",
+        "!nvidia-smi"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "6V0kOh2GuD3g"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!curl -ssL https://magic.modular.com/ | bash"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "UYmfAndVuD3h"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.environ['PATH'] +=':/root/.modular/bin'\n",
+        "\n"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "TUOeZZECuD3i"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic init gpu_puzzles --format mojoproject"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "rDmQeBRVuD3i"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd gpu_puzzles/"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "tNMsV7VwuD3i"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile add_10_2d.mojo\n",
+        "### Add a constant 10\n",
+        "### Implement a kernel that adds 10 to each position of 2d matrix a and stores it in out 2d matrix.\n",
+        "\n",
+        "\n",
+        "from gpu.host import DeviceContext\n",
+        "from memory import UnsafePointer\n",
+        "from gpu import thread_idx, block_dim\n",
+        "from testing import assert_equal\n",
+        "\n",
+        "alias SIZE = 2\n",
+        "alias BLOCKS_PER_GRID = 1\n",
+        "alias THREADS_PER_BLOCK = (3,3)\n",
+        "alias dtype = DType.float32\n",
+        "\n",
+        "\n",
+        "fn add_10_2d(\n",
+        "    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]], size: Int\n",
+        "):\n",
+        "    tid = thread_idx.z * (block_dim.y * block_dim.x) + thread_idx.y * block_dim.x + thread_idx.x\n",
+        "    if tid < size * size:\n",
+        "        out[tid] = array[tid] + 10\n",
+        "\n",
+        "\n",
+        "fn main():\n",
+        "  try:\n",
+        "    ctx = DeviceContext()\n",
+        "    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n",
+        "    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n",
+        "    expected = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n",
+        "\n",
+        "\n",
+        "    with d_array_buff.map_to_host() as h_array_buff:\n",
+        "        for i in range(SIZE):\n",
+        "            for j in range(SIZE):\n",
+        "                h_array_buff[i * SIZE + j] = i * SIZE + j\n",
+        "                expected[i * SIZE + j] = h_array_buff[i * SIZE + j] + 10\n",
+        "        print(\"Input: \", h_array_buff)\n",
+        "\n",
+        "    ctx.enqueue_function[add_10_2d](\n",
+        "            d_out_buff.unsafe_ptr(),\n",
+        "            d_array_buff.unsafe_ptr(),\n",
+        "            SIZE,\n",
+        "            grid_dim=BLOCKS_PER_GRID,\n",
+        "            block_dim=THREADS_PER_BLOCK,\n",
+        "        )\n",
+        "\n",
+        "    ctx.synchronize()\n",
+        "\n",
+        "    with d_out_buff.map_to_host() as h_out_buff:\n",
+        "        print(h_out_buff)\n",
+        "        print(expected)\n",
+        "        for i in range(SIZE * SIZE ):\n",
+        "            assert_equal(h_out_buff[i], expected[i])\n",
+        "\n",
+        "  except e:\n",
+        "    print(e)"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-14T12:13:26.016637Z",
+          "iopub.execute_input": "2025-05-14T12:13:26.017309Z",
+          "iopub.status.idle": "2025-05-14T12:13:26.022915Z",
+          "shell.execute_reply.started": "2025-05-14T12:13:26.017280Z",
+          "shell.execute_reply": "2025-05-14T12:13:26.022289Z"
+        },
+        "id": "lXVGbVNbuD3j",
+        "outputId": "6e70e176-460a-42b4-fa78-975bc9f13559"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "text": "Overwriting add_10_2d.mojo\n",
+          "output_type": "stream"
+        }
+      ],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo add_10_2d.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-14T12:13:33.613285Z",
+          "iopub.execute_input": "2025-05-14T12:13:33.613855Z",
+          "iopub.status.idle": "2025-05-14T12:13:39.393304Z",
+          "shell.execute_reply.started": "2025-05-14T12:13:33.613833Z",
+          "shell.execute_reply": "2025-05-14T12:13:39.392434Z"
+        },
+        "id": "ruyQcZM7uD3j",
+        "outputId": "dcbd01c5-d6d4-41c7-85d3-dbf3c6995eac"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "text": "\u001b[2K\u001b[32mâ \u001b[0m activating environment                                                        Input:  HostBuffer([0.0, 1.0, 2.0, 3.0])\nHostBuffer([10.0, 11.0, 12.0, 13.0])\nHostBuffer([10.0, 11.0, 12.0, 13.0])\n",
+          "output_type": "stream"
+        }
+      ],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo format add_10_2d.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "xYEQBuCeuD3k"
+      },
+      "outputs": [],
+      "execution_count": null
+    }
+  ]
+}
diff --git a/gpu_puzzles/add_10_2d.mojo b/gpu_puzzles/add_10_2d.mojo
@@ -0,0 +1,56 @@
+### Add a constant 10
+### Implement a kernel that adds 10 to each position of 2d matrix a and stores it in out 2d matrix.
+
+
+from gpu.host import DeviceContext
+from memory import UnsafePointer
+from gpu import thread_idx, block_dim
+from testing import assert_equal
+
+alias SIZE = 2
+alias BLOCKS_PER_GRID = 1
+alias THREADS_PER_BLOCK = (3,3)
+alias dtype = DType.float32
+
+
+fn add_10_2d(
+    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]], size: Int
+):
+    tid = thread_idx.z * (block_dim.y * block_dim.x) + thread_idx.y * block_dim.x + thread_idx.x
+    if tid < size * size:
+        out[tid] = array[tid] + 10
+
+
+fn main():
+  try:
+    ctx = DeviceContext()
+    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
+    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
+    expected = ctx.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
+
+
+    with d_array_buff.map_to_host() as h_array_buff:
+        for i in range(SIZE):
+            for j in range(SIZE):
+                h_array_buff[i * SIZE + j] = i * SIZE + j
+                expected[i * SIZE + j] = h_array_buff[i * SIZE + j] + 10
+        print("Input: ", h_array_buff)
+
+    ctx.enqueue_function[add_10_2d](
+            d_out_buff.unsafe_ptr(),
+            d_array_buff.unsafe_ptr(),
+            SIZE,
+            grid_dim=BLOCKS_PER_GRID,
+            block_dim=THREADS_PER_BLOCK,
+        )
+
+    ctx.synchronize()
+
+    with d_out_buff.map_to_host() as h_out_buff:
+        print(h_out_buff)
+        print(expected)
+        for i in range(SIZE * SIZE ):
+            assert_equal(h_out_buff[i], expected[i])
+
+  except e:
+    print(e)