ratulb
diff --git a/‎gpu_puzzles/naive_matmul.ipynb‎
Lines changed: 307 additions & 0 deletions b/‎gpu_puzzles/naive_matmul.ipynb‎
Lines changed: 307 additions & 0 deletions
@@ -0,0 +1,307 @@
+": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.11",
+      "mimetype": "text/x-python",
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "pygments_lexer": "ipython3",
+      "nbconvert_exporter": "python",
+      "file_extension": ".py"
+    },
+    "kaggle": {
+      "accelerator": "nvidiaTeslaT4",
+      "dataSources": [],
+      "dockerImageVersionId": 31041,
+      "isInternetEnabled": true,
+      "language": "python",
+      "sourceType": "notebook",
+      "isGpuEnabled": true
+    },
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "accelerator": "GPU"
+  },
+  "nbformat_minor": 0,
+  "nbformat": 4,
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!curl -ssL https://magic.modular.com/ | bash"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "buOgxm25ONit"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.environ['PATH'] +=':/root/.modular/bin'"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "FVZvyhRiONiw"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic init gpu_puzzles --format mojoproject"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "TqFD0EK0ONiw"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd gpu_puzzles/"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "k3Ddb6GcONiw"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile naive_matmaul.mojo\n",
+        "\n",
+        "### Dumb matrix multiplication\n",
+        "### Simulate the CPU-style triple for-loop truly dumb matrix multiplication\n",
+        "\n",
+        "from gpu.host import DeviceContext, HostBuffer\n",
+        "from gpu import thread_idx, block_idx, block_dim\n",
+        "import random\n",
+        "from layout import Layout, LayoutTensor\n",
+        "from memory import UnsafePointer, memcpy\n",
+        "from python import Python, PythonObject\n",
+        "from testing import assert_true\n",
+        "\n",
+        "\n",
+        "alias ROWS_A = 8\n",
+        "alias COLS_A = 16\n",
+        "alias ROWS_B = 16\n",
+        "alias COLS_B = 8\n",
+        "alias ROWS_C = 8\n",
+        "alias COLS_C = 8\n",
+        "\n",
+        "\n",
+        "alias MATRIX_MIN_ELEM = -5.0\n",
+        "alias MATRIX_MAX_ELEM = 5.0\n",
+        "\n",
+        "alias dtype = DType.float32\n",
+        "# Num threads per block\n",
+        "alias THREADS = 1\n",
+        "# Total numbers blocks in the grid\n",
+        "alias BLOCKS = 1\n",
+        "\n",
+        "alias layout_a = Layout.row_major(ROWS_A, COLS_A)\n",
+        "alias layout_b = Layout.row_major(ROWS_B, COLS_B)\n",
+        "alias layout_c = Layout.row_major(ROWS_C, COLS_C)\n",
+        "\n",
+        "# alias Matrix = LayoutTensor[dtype, _, MutableAnyOrigin]\n",
+        "alias Matrix = LayoutTensor[mut=True, dtype, _]\n",
+        "\n",
+        "\n",
+        "fn naive_matmaul(\n",
+        "    A: UnsafePointer[Scalar[dtype]],\n",
+        "    B: UnsafePointer[Scalar[dtype]],\n",
+        "    C: UnsafePointer[Scalar[dtype]],\n",
+        "):\n",
+        "    var tid = block_idx.x * block_dim.x + thread_idx.x\n",
+        "\n",
+        "    if tid == 0:\n",
+        "        for i in range(ROWS_A):\n",
+        "            for j in range(COLS_B):\n",
+        "                for k in range(COLS_A):\n",
+        "                    (C + i * COLS_C + j)[] += (A + i * COLS_A + k)[] * (\n",
+        "                        B + k * COLS_B + j\n",
+        "                    )[]\n",
+        "\n",
+        "\n",
+        "# Initialize the matrix buffer with values in the range 0 to 100\n",
+        "fn fill_buffer(buffer: HostBuffer[dtype]):\n",
+        "    # Randomize\n",
+        "    # random.seed()\n",
+        "    for i in range(len(buffer)):\n",
+        "        buffer[i] = random.random_float64(\n",
+        "            MATRIX_MIN_ELEM, MATRIX_MAX_ELEM\n",
+        "        ).cast[dtype]()[0]\n",
+        "\n",
+        "\n",
+        "fn main():\n",
+        "    try:\n",
+        "        ctx = DeviceContext()\n",
+        "\n",
+        "        buffer_a = ctx.enqueue_create_buffer[dtype](\n",
+        "            ROWS_A * COLS_A\n",
+        "        ).enqueue_fill(0.0)\n",
+        "        buffer_b = ctx.enqueue_create_buffer[dtype](\n",
+        "            ROWS_B * COLS_B\n",
+        "        ).enqueue_fill(0.0)\n",
+        "        buffer_c = ctx.enqueue_create_buffer[dtype](\n",
+        "            ROWS_C * COLS_C\n",
+        "        ).enqueue_fill(0.0)\n",
+        "\n",
+        "        with buffer_a.map_to_host() as h_buffer_a:\n",
+        "            fill_buffer(h_buffer_a)\n",
+        "\n",
+        "        with buffer_b.map_to_host() as h_buffer_b:\n",
+        "            fill_buffer(h_buffer_b)\n",
+        "\n",
+        "        # matrix_a = LayoutTensor[dtype, layout_a, MutableAnyOrigin](buffer_a)\n",
+        "        # matrix_b = LayoutTensor[dtype, layout_b, MutableAnyOrigin](buffer_b)\n",
+        "        # matrix_c =  LayoutTensor[dtype, layout_c, MutableAnyOrigin](buffer_c)\n",
+        "\n",
+        "        ctx.enqueue_function[naive_matmaul](\n",
+        "            buffer_a.unsafe_ptr(),\n",
+        "            buffer_b.unsafe_ptr(),\n",
+        "            buffer_c.unsafe_ptr(),\n",
+        "            grid_dim=BLOCKS,\n",
+        "            block_dim=THREADS,\n",
+        "        )\n",
+        "\n",
+        "        ctx.synchronize()\n",
+        "\n",
+        "        with buffer_a.map_to_host() as h_buffer_a:\n",
+        "            with buffer_b.map_to_host() as h_buffer_b:\n",
+        "                with buffer_c.map_to_host() as h_buffer_c:\n",
+        "                    assert_allclose(\n",
+        "                        (ROWS_A, COLS_A, h_buffer_a),\n",
+        "                        (ROWS_B, COLS_B, h_buffer_b),\n",
+        "                        (ROWS_C, COLS_C, h_buffer_c),\n",
+        "                    )\n",
+        "\n",
+        "    except e:\n",
+        "        print(\"Prininting here: \", e)\n",
+        "\n",
+        "\n",
+        "fn assert_allclose(\n",
+        "    buff_a_with_dims: (Int, Int, HostBuffer[dtype]),\n",
+        "    buff_b_with_dims: (Int, Int, HostBuffer[dtype]),\n",
+        "    buff_c_with_dims: (Int, Int, HostBuffer[dtype]),\n",
+        ") raises:\n",
+        "    a_rows, a_cols, a_buff = buff_a_with_dims\n",
+        "    matrix_a = reshape(to_ndarray(a_buff), a_rows, a_cols)\n",
+        "\n",
+        "    b_rows, b_cols, b_buff = buff_b_with_dims\n",
+        "    matrix_b = reshape(to_ndarray(b_buff), b_rows, b_cols)\n",
+        "\n",
+        "    c_rows, c_cols, c_buff = buff_c_with_dims\n",
+        "    matrix_c = reshape(to_ndarray(c_buff), c_rows, c_cols)\n",
+        "    np = Python.import_module(\"numpy\")\n",
+        "    assert_true(np.allclose(np.matmul(matrix_a, matrix_b), matrix_c))\n",
+        "    print(\"Assertion was successful\")\n",
+        "\n",
+        "\n",
+        "fn to_ndarray(buffer: HostBuffer[dtype]) raises -> PythonObject:\n",
+        "    np = Python.import_module(\"numpy\")\n",
+        "    ndarray = np.zeros(len(buffer), dtype=np.float32)\n",
+        "    ndarray_ptr = ndarray_ptr[dtype](ndarray)\n",
+        "    buffer_ptr = buffer.unsafe_ptr()\n",
+        "    memcpy(ndarray_ptr, buffer_ptr, len(buffer))\n",
+        "    return ndarray\n",
+        "\n",
+        "\n",
+        "fn reshape(ndarray: PythonObject, rows: Int, cols: Int) raises -> PythonObject:\n",
+        "    return ndarray.reshape(rows, cols)\n",
+        "\n",
+        "\n",
+        "fn ndarray_ptr[\n",
+        "    dtype: DType\n",
+        "](ndarray: PythonObject) raises -> UnsafePointer[Scalar[dtype]]:\n",
+        "    return ndarray.__array_interface__[\"data\"][0].unsafe_get_as_pointer[dtype]()"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-17T17:07:37.176099Z",
+          "iopub.execute_input": "2025-05-17T17:07:37.176782Z",
+          "iopub.status.idle": "2025-05-17T17:07:37.183766Z",
+          "shell.execute_reply.started": "2025-05-17T17:07:37.176750Z",
+          "shell.execute_reply": "2025-05-17T17:07:37.183011Z"
+        },
+        "id": "IaxB1auxONix"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo naive_matmaul.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-17T17:07:53.713699Z",
+          "iopub.execute_input": "2025-05-17T17:07:53.713971Z",
+          "iopub.status.idle": "2025-05-17T17:08:01.501894Z",
+          "shell.execute_reply.started": "2025-05-17T17:07:53.713950Z",
+          "shell.execute_reply": "2025-05-17T17:08:01.501214Z"
+        },
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "h2k9wkDaONiz",
+        "outputId": "20528bac-8ce4-4cb5-829c-c4ed968d7172"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[32m⠁\u001b[0m                                                                               \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[32m⠁\u001b[0m activating environment                                                        \r\u001b[2K\u001b[1m/content/gpu_puzzles/naive_matmaul.mojo:1:1: \u001b[0m\u001b[0;1;35mwarning: \u001b[0m\u001b[1mstruct 'HostBuffer' utilizes conformance to trait 'Copyable & Movable' but does not explicitly declare it (implicit conformance is deprecated)\n",
+            "\u001b[0m### Dumb matrix multiplication\n",
+            "\u001b[0;1;32m^\n",
+            "\u001b[0m\u001b[1m/content/gpu_puzzles/naive_matmaul.mojo:1:1: \u001b[0m\u001b[0;1;35mwarning: \u001b[0m\u001b[1mstruct 'PythonObject' utilizes conformance to trait 'Boolable' but does not explicitly declare it (implicit conformance is deprecated)\n",
+            "\u001b[0m### Dumb matrix multiplication\n",
+            "\u001b[0;1;32m^\n",
+            "\u001b[0mAssertion was successful\n"
+          ]
+        }
+      ],
+      "execution_count": 8
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo format naive_matmaul.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-17T17:05:36.835819Z",
+          "iopub.execute_input": "2025-05-17T17:05:36.836092Z",
+          "iopub.status.idle": "2025-05-17T17:05:37.268163Z",
+          "shell.execute_reply.started": "2025-05-17T17:05:36.836067Z",
+          "shell.execute_reply": "2025-05-17T17:05:37.267496Z"
+        },
+        "id": "bSglX7bNONi0"
+      },
+      "outputs": [],
+      "execution_count": null
+    }
+  ]
+}