Add vectors

ratulb · ratulb · commit 3deae99c78a7 · 2025-05-14T08:52:03.000Z
diff --git a/gpu_puzzles/add_10.ipynb b/gpu_puzzles/add_10.ipynb
@@ -56,6 +56,7 @@
         "from gpu.host import DeviceContext\n",
         "from memory import UnsafePointer\n",
         "from gpu import thread_idx\n",
+        "from testing import assert_equal\n",
         "\n",
         "alias SIZE = 4\n",
         "alias BLOCKS_PER_GRID = 1\n",
@@ -73,9 +74,8 @@
         "fn main() raises:\n",
         "    ctx = DeviceContext()\n",
         "    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
-        "    expected = ctx.enqueue_create_buffer[dtype](SIZE)\n",
         "    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
-        "\n",
+        "    expected = ctx.enqueue_create_host_buffer[dtype](SIZE)\n",
         "    _ = d_out_buff.enqueue_fill(0)\n",
         "\n",
         "    with d_array_buff.map_to_host() as h_array_buff:\n",
@@ -91,17 +91,22 @@
         "\n",
         "    ctx.synchronize()\n",
         "\n",
+        "    for i in range(SIZE):\n",
+        "        expected[i] = i + 10\n",
+        "\n",
         "    with d_out_buff.map_to_host() as h_out_buff:\n",
-        "        print(h_out_buff)\n"
+        "        print(h_out_buff)\n",
+        "        for i in range(SIZE):\n",
+        "            assert_equal(h_out_buff[i], expected[i])\n"
       ],
       "metadata": {
         "id": "UT3V1O2M7txw",
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "outputId": "6557f1f2-e6ff-4850-84c6-f7cc69954f57"
+        "outputId": "30d37b0f-737b-4857-8b30-72cc17964d25"
       },
-      "execution_count": 25,
+      "execution_count": 30,
       "outputs": [
         {
           "output_type": "stream",
@@ -119,12 +124,12 @@
       ],
       "metadata": {
         "id": "CkjRGISm7y1Q",
-        "outputId": "458efc1c-a004-43e8-b540-f973f7f26027",
+        "outputId": "4b8f6798-4e2f-4228-d899-9492e1901abe",
         "colab": {
           "base_uri": "https://localhost:8080/"
         }
       },
-      "execution_count": 26,
+      "execution_count": 31,
       "outputs": [
         {
           "output_type": "stream",
diff --git a/gpu_puzzles/add_vectors.ipynb b/gpu_puzzles/add_vectors.ipynb
@@ -0,0 +1,237 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!curl -ssL https://magic.modular.com/ | bash"
+      ],
+      "metadata": {
+        "id": "A8X6phvz7ZoQ",
+        "outputId": "21c85116-eb45-4bff-fe32-ba59589954be",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Installing the latest version of Magic...\n",
+            "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+            "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+            "  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0\n",
+            "100 49.9M  100 49.9M    0     0  9574k      0  0:00:05  0:00:05 --:--:-- 72.0M\n",
+            "Done. The 'magic' binary is in '/root/.modular/bin'\n",
+            "\n",
+            "Two more steps:\n",
+            "1. To use 'magic', run this command so it's in your PATH:\n",
+            "source /root/.bashrc\n",
+            "2. To build with MAX and Mojo, go to http://modul.ar/get-started\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.environ['PATH'] += ':/root/.modular/bin'"
+      ],
+      "metadata": {
+        "id": "n7zS_6gK7fnB"
+      },
+      "execution_count": 2,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic init gpu_puzzles --format mojoproject"
+      ],
+      "metadata": {
+        "id": "Zlg5BNMn7j64",
+        "outputId": "d8c7e15e-af56-4a3e-8340-984d2df2c267",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m✔ \u001b[0mCreated /content/gpu_puzzles/mojoproject.toml\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd gpu_puzzles/"
+      ],
+      "metadata": {
+        "id": "mO77-mj17lsA",
+        "outputId": "669fb4cf-44e7-44f7-bb8a-bf7c0ebb2832",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "/content/gpu_puzzles\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile add_vectors.mojo\n",
+        "\n",
+        "### Add vectors\n",
+        "### Mojo kernel for adding corresponding elements of vectors a and b, store in out.\n",
+        "\n",
+        "from gpu.host import DeviceContext\n",
+        "from memory import UnsafePointer\n",
+        "from gpu import thread_idx, block_idx, block_dim\n",
+        "from testing import assert_equal\n",
+        "\n",
+        "alias SIZE = 4\n",
+        "alias BLOCKS_PER_GRID = 1\n",
+        "alias THREADS_PER_BLOCK = SIZE\n",
+        "alias dtype = DType.float32\n",
+        "\n",
+        "\n",
+        "fn add(\n",
+        "    out: UnsafePointer[Scalar[dtype]],\n",
+        "    a: UnsafePointer[Scalar[dtype]],\n",
+        "    b: UnsafePointer[Scalar[dtype]],\n",
+        "):\n",
+        "    tid = block_idx.x * block_dim.x + thread_idx.x\n",
+        "    if tid < SIZE:\n",
+        "        out[tid] = a[tid] + b[tid]\n",
+        "\n",
+        "\n",
+        "fn main() raises:\n",
+        "    ctx = DeviceContext()\n",
+        "    d_array_buff_1 = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    d_array_buff_2 = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    expected = ctx.enqueue_create_host_buffer[dtype](SIZE)\n",
+        "    _ = d_out_buff.enqueue_fill(0)\n",
+        "    _ = expected.enqueue_fill(SIZE - 1)\n",
+        "\n",
+        "    with d_array_buff_1.map_to_host() as h_array_buff_1:\n",
+        "        for i in range(SIZE):\n",
+        "            h_array_buff_1[i] = i\n",
+        "\n",
+        "    with d_array_buff_2.map_to_host() as h_array_buff_2:\n",
+        "        for i in range(SIZE - 1, -1, -1):\n",
+        "            h_array_buff_2[SIZE - 1 - i] = i\n",
+        "\n",
+        "    ctx.enqueue_function[add](\n",
+        "        d_out_buff.unsafe_ptr(),\n",
+        "        d_array_buff_1.unsafe_ptr(),\n",
+        "        d_array_buff_2.unsafe_ptr(),\n",
+        "        grid_dim=BLOCKS_PER_GRID,\n",
+        "        block_dim=THREADS_PER_BLOCK,\n",
+        "    )\n",
+        "\n",
+        "    ctx.synchronize()\n",
+        "\n",
+        "    with d_out_buff.map_to_host() as h_out_buff:\n",
+        "        print(h_out_buff)\n",
+        "        for i in range(SIZE):\n",
+        "            assert_equal(h_out_buff[i], expected[i])\n"
+      ],
+      "metadata": {
+        "id": "UT3V1O2M7txw",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "786e9c23-3d97-4238-fcac-117ba6e8555f"
+      },
+      "execution_count": 27,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Overwriting add_vectors.mojo\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo add_vectors.mojo"
+      ],
+      "metadata": {
+        "id": "CkjRGISm7y1Q",
+        "outputId": "d05fe142-8743-4ecc-9401-48e532b5ef0a",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "execution_count": 26,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2KHostBuffer([3.0, 3.0, 3.0, 3.0])\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo format add_vectors.mojo"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Cc2XVTrevpy5",
+        "outputId": "112f97cd-da96-4c3a-831e-f9f4eae633d9"
+      },
+      "execution_count": 24,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[1mreformatted add_vectors.mojo\u001b[0m\n",
+            "\n",
+            "\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
+            "\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
+          ]
+        }
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "name": "Welcome To Colab",
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "accelerator": "GPU"
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/gpu_puzzles/add_vectors.mojo b/gpu_puzzles/add_vectors.mojo
@@ -0,0 +1,57 @@
+
+### Add vectors
+### Mojo kernel for adding corresponding elements of vectors a and b, store in out.
+
+from gpu.host import DeviceContext
+from memory import UnsafePointer
+from gpu import thread_idx, block_idx, block_dim
+from testing import assert_equal
+
+alias SIZE = 4
+alias BLOCKS_PER_GRID = 1
+alias THREADS_PER_BLOCK = SIZE
+alias dtype = DType.float32
+
+
+fn add(
+    out: UnsafePointer[Scalar[dtype]],
+    a: UnsafePointer[Scalar[dtype]],
+    b: UnsafePointer[Scalar[dtype]],
+):
+    tid = block_idx.x * block_dim.x + thread_idx.x
+    if tid < SIZE:
+        out[tid] = a[tid] + b[tid]
+
+
+fn main() raises:
+    ctx = DeviceContext()
+    d_array_buff_1 = ctx.enqueue_create_buffer[dtype](SIZE)
+    d_array_buff_2 = ctx.enqueue_create_buffer[dtype](SIZE)
+    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)
+    expected = ctx.enqueue_create_host_buffer[dtype](SIZE)
+    _ = d_out_buff.enqueue_fill(0)
+    _ = expected.enqueue_fill(SIZE - 1)
+
+    with d_array_buff_1.map_to_host() as h_array_buff_1:
+        for i in range(SIZE):
+            h_array_buff_1[i] = i
+
+    with d_array_buff_2.map_to_host() as h_array_buff_2:
+        for i in range(SIZE - 1, -1, -1):
+            h_array_buff_2[SIZE - 1 - i] = i
+
+    ctx.enqueue_function[add](
+        d_out_buff.unsafe_ptr(),
+        d_array_buff_1.unsafe_ptr(),
+        d_array_buff_2.unsafe_ptr(),
+        grid_dim=BLOCKS_PER_GRID,
+        block_dim=THREADS_PER_BLOCK,
+    )
+
+    ctx.synchronize()
+
+    with d_out_buff.map_to_host() as h_out_buff:
+        print(h_out_buff)
+        for i in range(SIZE):
+            assert_equal(h_out_buff[i], expected[i])
+