Histogram kernel

ratulb · ratulb · commit 78af51a5bce0 · 2025-05-16T16:26:35.000Z
diff --git a/mojo_kernels/histogram.ipynb b/mojo_kernels/histogram.ipynb
@@ -0,0 +1,247 @@
+{
+  "metadata": {
+    "kernelspec": {
+      "language": "python",
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11.11",
+      "mimetype": "text/x-python",
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "pygments_lexer": "ipython3",
+      "nbconvert_exporter": "python",
+      "file_extension": ".py"
+    },
+    "kaggle": {
+      "accelerator": "nvidiaTeslaT4",
+      "dataSources": [],
+      "dockerImageVersionId": 31041,
+      "isInternetEnabled": true,
+      "language": "python",
+      "sourceType": "notebook",
+      "isGpuEnabled": true
+    },
+    "colab": {
+      "provenance": []
+    }
+  },
+  "nbformat_minor": 0,
+  "nbformat": 4,
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!curl -ssL https://magic.modular.com/ | bash"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "BTCO9U2456-k"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.environ['PATH'] +=':/root/.modular/bin'"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "nlCI99K156-m"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic init gpu_puzzles --format mojoproject"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "TWvNvJN256-m"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd gpu_puzzles/"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "cO0pUIs-56-n"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile histogram.mojo\n",
+        "\n",
+        "### Histogram\n",
+        "### Program to compute histogram of a 1D array\n",
+        "\n",
+        "from gpu.host import DeviceContext, HostBuffer, DeviceBuffer\n",
+        "from gpu import thread_idx, block_idx, block_dim\n",
+        "import random\n",
+        "from math import ceildiv\n",
+        "from memory import UnsafePointer\n",
+        "from layout import Layout, LayoutTensor\n",
+        "from os import Atomic\n",
+        "from os.atomic import Consistency\n",
+        "\n",
+        "alias dtype = DType.int64\n",
+        "# How many numbers to bin? 2 ^ 20 (default)\n",
+        "alias ELEMS_COUNT = 1 << 20\n",
+        "# How many bins?\n",
+        "alias NUM_BINS = 10\n",
+        "# Num threads per block\n",
+        "alias THREADS = 256\n",
+        "# Total numbers blocks in the grid\n",
+        "alias BLOCKS = ceildiv(ELEMS_COUNT, THREADS)\n",
+        "\n",
+        "# Max value of any binned element\n",
+        "alias MAX_ELEM = 101\n",
+        "alias MIN_ELEM = 1\n",
+        "\n",
+        "alias BIN_WIDTH = (MAX_ELEM - MIN_ELEM + 1) // NUM_BINS\n",
+        "alias input_layout = Layout.row_major(ELEMS_COUNT)\n",
+        "\n",
+        "\n",
+        "\n",
+        "fn histogram(input: LayoutTensor[dtype, input_layout, MutableAnyOrigin], output: UnsafePointer[Scalar[dtype]], total_elems: Int):\n",
+        "    var tid = block_idx.x * block_dim.x + thread_idx.x\n",
+        "\n",
+        "    if tid < total_elems:\n",
+        "        var elem = input[tid]\n",
+        "        bin_index = bin_index(elem[0])\n",
+        "        #_ = Atomic.fetch_add[ordering= Consistency.MONOTONIC](output + bin_index, 1)\n",
+        "        _ = Atomic.fetch_add(output + bin_index, 1)\n",
+        "\n",
+        "\n",
+        "# Initialize the input buffer with values in the range 0 to 100\n",
+        "fn fill_buffer(buffer: HostBuffer[dtype]):\n",
+        "    # Randomize\n",
+        "    random.seed()\n",
+        "    for i in range(len(buffer)):\n",
+        "        buffer[i] = random.random_ui64(MIN_ELEM, MAX_ELEM).cast[dtype]()[0]\n",
+        "\n",
+        "# Find the bin index given a number\n",
+        "@always_inline\n",
+        "fn bin_index(elem: Int64) -> Int:\n",
+        "    bin_index = Int((elem - MIN_ELEM) // BIN_WIDTH)\n",
+        "    if bin_index >= NUM_BINS:\n",
+        "            bin_index = NUM_BINS - 1\n",
+        "    elif bin_index < 0:\n",
+        "        bin_index = 0\n",
+        "    return bin_index\n",
+        "\n",
+        "\n",
+        "fn main():\n",
+        "    try:\n",
+        "        ctx = DeviceContext()\n",
+        "\n",
+        "        elements = ctx.enqueue_create_buffer[dtype](ELEMS_COUNT)\n",
+        "        bins = ctx.enqueue_create_buffer[dtype](NUM_BINS).enqueue_fill(0)\n",
+        "\n",
+        "        with elements.map_to_host() as host_elements:\n",
+        "            fill_buffer(host_elements)\n",
+        "\n",
+        "        input_tensor = LayoutTensor[dtype, input_layout, MutableAnyOrigin](elements)\n",
+        "\n",
+        "        ctx.enqueue_function[histogram](input_tensor, bins.unsafe_ptr(), ELEMS_COUNT,\n",
+        "            grid_dim=BLOCKS, block_dim=THREADS\n",
+        "        )\n",
+        "\n",
+        "        ctx.synchronize()\n",
+        "\n",
+        "        with bins.map_to_host() as bins_host:\n",
+        "            print(bins_host)\n",
+        "\n",
+        "\n",
+        "        print(ctx.name())\n",
+        "    except e:\n",
+        "        print(\"Prininting here: \", e)"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-16T16:20:06.536260Z",
+          "iopub.execute_input": "2025-05-16T16:20:06.536552Z",
+          "iopub.status.idle": "2025-05-16T16:20:06.542802Z",
+          "shell.execute_reply.started": "2025-05-16T16:20:06.536524Z",
+          "shell.execute_reply": "2025-05-16T16:20:06.542252Z"
+        },
+        "id": "JRUruBFe56-n",
+        "outputId": "91dee3ef-e61e-49be-e80d-123a4ec0c8ca"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "text": "Overwriting histogram.mojo\n",
+          "output_type": "stream"
+        }
+      ],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo histogram.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "execution": {
+          "iopub.status.busy": "2025-05-16T16:20:37.738801Z",
+          "iopub.execute_input": "2025-05-16T16:20:37.739075Z",
+          "iopub.status.idle": "2025-05-16T16:20:38.890961Z",
+          "shell.execute_reply.started": "2025-05-16T16:20:37.739049Z",
+          "shell.execute_reply": "2025-05-16T16:20:38.890297Z"
+        },
+        "id": "Pq-Zyr1L56-o",
+        "outputId": "a678f26f-061a-4dc7-ef59-4d7b136362c5"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "text": "\u001b[2K\u001b[32mâ \u001b[0m activating environment                                                        HostBuffer([103435, 103603, 104026, 103629, 103548, 104964, 103544, 103254, 103962, 114611])\nTesla T4\n",
+          "output_type": "stream"
+        }
+      ],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!magic run mojo format histogram.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "qV8bAVgB56-o"
+      },
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!cat histogram.mojo"
+      ],
+      "metadata": {
+        "trusted": true,
+        "id": "Os7BbtUC56-p"
+      },
+      "outputs": [],
+      "execution_count": null
+    }
+  ]
+}
diff --git a/mojo_kernels/histogram.mojo b/mojo_kernels/histogram.mojo
@@ -0,0 +1,94 @@
+### Histogram
+### Program to compute histogram of a 1D array
+
+from gpu.host import DeviceContext, HostBuffer, DeviceBuffer
+from gpu import thread_idx, block_idx, block_dim
+import random
+from math import ceildiv
+from memory import UnsafePointer
+from layout import Layout, LayoutTensor
+from os import Atomic
+from os.atomic import Consistency
+
+alias dtype = DType.int64
+# How many numbers to bin? 2 ^ 20 (default)
+alias ELEMS_COUNT = 1 << 20
+# How many bins?
+alias NUM_BINS = 10
+# Num threads per block
+alias THREADS = 256
+# Total numbers blocks in the grid
+alias BLOCKS = ceildiv(ELEMS_COUNT, THREADS)
+
+# Max value of any binned element
+alias MAX_ELEM = 101
+alias MIN_ELEM = 1
+
+alias BIN_WIDTH = (MAX_ELEM - MIN_ELEM + 1) // NUM_BINS
+alias input_layout = Layout.row_major(ELEMS_COUNT)
+
+
+fn histogram(
+    input: LayoutTensor[dtype, input_layout, MutableAnyOrigin],
+    output: UnsafePointer[Scalar[dtype]],
+    total_elems: Int,
+):
+    var tid = block_idx.x * block_dim.x + thread_idx.x
+
+    if tid < total_elems:
+        var elem = input[tid]
+        bin_index = bin_index(elem[0])
+        # _ = Atomic.fetch_add[ordering= Consistency.MONOTONIC](output + bin_index, 1)
+        _ = Atomic.fetch_add(output + bin_index, 1)
+
+
+# Initialize the input buffer with values in the range 0 to 100
+fn fill_buffer(buffer: HostBuffer[dtype]):
+    # Randomize
+    random.seed()
+    for i in range(len(buffer)):
+        buffer[i] = random.random_ui64(MIN_ELEM, MAX_ELEM).cast[dtype]()[0]
+
+
+# Find the bin index given a number
+@always_inline
+fn bin_index(elem: Int64) -> Int:
+    bin_index = Int((elem - MIN_ELEM) // BIN_WIDTH)
+    if bin_index >= NUM_BINS:
+        bin_index = NUM_BINS - 1
+    elif bin_index < 0:
+        bin_index = 0
+    return bin_index
+
+
+fn main():
+    try:
+        ctx = DeviceContext()
+
+        elements = ctx.enqueue_create_buffer[dtype](ELEMS_COUNT)
+        bins = ctx.enqueue_create_buffer[dtype](NUM_BINS).enqueue_fill(0)
+
+        with elements.map_to_host() as host_elements:
+            fill_buffer(host_elements)
+
+        input_tensor = LayoutTensor[dtype, input_layout, MutableAnyOrigin](
+            elements
+        )
+        # output_tensor = LayoutTensor[mut=True, dtype, output_layout](bins)
+
+        ctx.enqueue_function[histogram](
+            input_tensor,
+            bins.unsafe_ptr(),
+            ELEMS_COUNT,
+            grid_dim=BLOCKS,
+            block_dim=THREADS,
+        )
+
+        ctx.synchronize()
+
+        with bins.map_to_host() as bins_host:
+            print(bins_host)
+
+        print(ctx.name())
+    except e:
+        print("Prininting here: ", e)