ratulb
diff --git a/‎gpu_puzzles/add_10.ipynb‎
Lines changed: 178 additions & 0 deletions b/‎gpu_puzzles/add_10.ipynb‎
Lines changed: 178 additions & 0 deletions
diff --git a/‎gpu_puzzles/add_10.mojo‎
Lines changed: 44 additions & 0 deletions b/‎gpu_puzzles/add_10.mojo‎
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,178 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!curl -ssL https://magic.modular.com/ | bash"
+      ],
+      "metadata": {
+        "id": "A8X6phvz7ZoQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.environ['PATH'] += ':/root/.modular/bin'"
+      ],
+      "metadata": {
+        "id": "n7zS_6gK7fnB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic init gpu_puzzles --format mojoproject"
+      ],
+      "metadata": {
+        "id": "Zlg5BNMn7j64"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd gpu_puzzles/"
+      ],
+      "metadata": {
+        "id": "mO77-mj17lsA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile add_10.mojo\n",
+        "\n",
+        "### Add 10\n",
+        "### Implement a kernel that adds 10 to each position of vector a and stores it in vector out.\n",
+        "\n",
+        "from gpu.host import DeviceContext\n",
+        "from memory import UnsafePointer\n",
+        "from gpu import thread_idx\n",
+        "\n",
+        "alias SIZE = 4\n",
+        "alias BLOCKS_PER_GRID = 1\n",
+        "alias THREADS_PER_BLOCK = SIZE\n",
+        "alias dtype = DType.float32\n",
+        "\n",
+        "\n",
+        "fn add_10(\n",
+        "    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]\n",
+        "):\n",
+        "    tid = thread_idx.x\n",
+        "    out[tid] = array[tid] + 10\n",
+        "\n",
+        "\n",
+        "fn main() raises:\n",
+        "    ctx = DeviceContext()\n",
+        "    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    expected = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)\n",
+        "\n",
+        "    _ = d_out_buff.enqueue_fill(0)\n",
+        "\n",
+        "    with d_array_buff.map_to_host() as h_array_buff:\n",
+        "        for i in range(SIZE):\n",
+        "            h_array_buff[i] = i\n",
+        "\n",
+        "    ctx.enqueue_function[add_10](\n",
+        "        d_out_buff.unsafe_ptr(),\n",
+        "        d_array_buff.unsafe_ptr(),\n",
+        "        grid_dim=BLOCKS_PER_GRID,\n",
+        "        block_dim=THREADS_PER_BLOCK,\n",
+        "    )\n",
+        "\n",
+        "    ctx.synchronize()\n",
+        "\n",
+        "    with d_out_buff.map_to_host() as h_out_buff:\n",
+        "        print(h_out_buff)\n"
+      ],
+      "metadata": {
+        "id": "UT3V1O2M7txw",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "6557f1f2-e6ff-4850-84c6-f7cc69954f57"
+      },
+      "execution_count": 25,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Overwriting add_10.mojo\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo add_10.mojo"
+      ],
+      "metadata": {
+        "id": "CkjRGISm7y1Q",
+        "outputId": "458efc1c-a004-43e8-b540-f973f7f26027",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "execution_count": 26,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2KHostBuffer([10.0, 11.0, 12.0, 13.0])\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo format add_10.mojo"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Cc2XVTrevpy5",
+        "outputId": "7608d036-835a-446b-96eb-68ffea96ba1e"
+      },
+      "execution_count": 27,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[1mreformatted add_10.mojo\u001b[0m\n",
+            "\n",
+            "\u001b[1mAll done! ✨ 🍰 ✨\u001b[0m\n",
+            "\u001b[34m\u001b[1m1 file \u001b[0m\u001b[1mreformatted\u001b[0m.\n"
+          ]
+        }
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "accelerator": "GPU"
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
@@ -0,0 +1,44 @@
+### Add 10
+### Implement a kernel that adds 10 to each position of vector a and stores it in vector out.
+
+from gpu.host import DeviceContext
+from memory import UnsafePointer
+from gpu import thread_idx
+
+alias SIZE = 4
+alias BLOCKS_PER_GRID = 1
+alias THREADS_PER_BLOCK = SIZE
+alias dtype = DType.float32
+
+
+fn add_10(
+    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]
+):
+    tid = thread_idx.x
+    out[tid] = array[tid] + 10
+
+
+fn main() raises:
+    ctx = DeviceContext()
+    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE)
+    expected = ctx.enqueue_create_buffer[dtype](SIZE)
+    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)
+
+    _ = d_out_buff.enqueue_fill(0)
+
+    with d_array_buff.map_to_host() as h_array_buff:
+        for i in range(SIZE):
+            h_array_buff[i] = i
+
+    ctx.enqueue_function[add_10](
+        d_out_buff.unsafe_ptr(),
+        d_array_buff.unsafe_ptr(),
+        grid_dim=BLOCKS_PER_GRID,
+        block_dim=THREADS_PER_BLOCK,
+    )
+
+    ctx.synchronize()
+
+    with d_out_buff.map_to_host() as h_out_buff:
+        print(h_out_buff)
+