Skip to content

Commit 78af51a

Browse files
committed
Histogram kernel
1 parent f8a4a13 commit 78af51a

File tree

2 files changed

+341
-0
lines changed

2 files changed

+341
-0
lines changed

mojo_kernels/histogram.ipynb

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
{
2+
"metadata": {
3+
"kernelspec": {
4+
"language": "python",
5+
"display_name": "Python 3",
6+
"name": "python3"
7+
},
8+
"language_info": {
9+
"name": "python",
10+
"version": "3.11.11",
11+
"mimetype": "text/x-python",
12+
"codemirror_mode": {
13+
"name": "ipython",
14+
"version": 3
15+
},
16+
"pygments_lexer": "ipython3",
17+
"nbconvert_exporter": "python",
18+
"file_extension": ".py"
19+
},
20+
"kaggle": {
21+
"accelerator": "nvidiaTeslaT4",
22+
"dataSources": [],
23+
"dockerImageVersionId": 31041,
24+
"isInternetEnabled": true,
25+
"language": "python",
26+
"sourceType": "notebook",
27+
"isGpuEnabled": true
28+
},
29+
"colab": {
30+
"provenance": []
31+
}
32+
},
33+
"nbformat_minor": 0,
34+
"nbformat": 4,
35+
"cells": [
36+
{
37+
"cell_type": "code",
38+
"source": [
39+
"!curl -ssL https://magic.modular.com/ | bash"
40+
],
41+
"metadata": {
42+
"trusted": true,
43+
"id": "BTCO9U2456-k"
44+
},
45+
"outputs": [],
46+
"execution_count": null
47+
},
48+
{
49+
"cell_type": "code",
50+
"source": [
51+
"import os\n",
52+
"os.environ['PATH'] +=':/root/.modular/bin'"
53+
],
54+
"metadata": {
55+
"trusted": true,
56+
"id": "nlCI99K156-m"
57+
},
58+
"outputs": [],
59+
"execution_count": null
60+
},
61+
{
62+
"cell_type": "code",
63+
"source": [
64+
"!magic init gpu_puzzles --format mojoproject"
65+
],
66+
"metadata": {
67+
"trusted": true,
68+
"id": "TWvNvJN256-m"
69+
},
70+
"outputs": [],
71+
"execution_count": null
72+
},
73+
{
74+
"cell_type": "code",
75+
"source": [
76+
"%cd gpu_puzzles/"
77+
],
78+
"metadata": {
79+
"trusted": true,
80+
"id": "cO0pUIs-56-n"
81+
},
82+
"outputs": [],
83+
"execution_count": null
84+
},
85+
{
86+
"cell_type": "code",
87+
"source": [
88+
"%%writefile histogram.mojo\n",
89+
"\n",
90+
"### Histogram\n",
91+
"### Program to compute histogram of a 1D array\n",
92+
"\n",
93+
"from gpu.host import DeviceContext, HostBuffer, DeviceBuffer\n",
94+
"from gpu import thread_idx, block_idx, block_dim\n",
95+
"import random\n",
96+
"from math import ceildiv\n",
97+
"from memory import UnsafePointer\n",
98+
"from layout import Layout, LayoutTensor\n",
99+
"from os import Atomic\n",
100+
"from os.atomic import Consistency\n",
101+
"\n",
102+
"alias dtype = DType.int64\n",
103+
"# How many numbers to bin? 2 ^ 20 (default)\n",
104+
"alias ELEMS_COUNT = 1 << 20\n",
105+
"# How many bins?\n",
106+
"alias NUM_BINS = 10\n",
107+
"# Num threads per block\n",
108+
"alias THREADS = 256\n",
109+
"# Total number of blocks in the grid\n",
110+
"alias BLOCKS = ceildiv(ELEMS_COUNT, THREADS)\n",
111+
"\n",
112+
"# Max value of any binned element\n",
113+
"alias MAX_ELEM = 101\n",
114+
"alias MIN_ELEM = 1\n",
115+
"\n",
116+
"alias BIN_WIDTH = (MAX_ELEM - MIN_ELEM + 1) // NUM_BINS\n",
117+
"alias input_layout = Layout.row_major(ELEMS_COUNT)\n",
118+
"\n",
119+
"\n",
120+
"\n",
121+
"fn histogram(input: LayoutTensor[dtype, input_layout, MutableAnyOrigin], output: UnsafePointer[Scalar[dtype]], total_elems: Int):\n",
122+
" var tid = block_idx.x * block_dim.x + thread_idx.x\n",
123+
"\n",
124+
" if tid < total_elems:\n",
125+
" var elem = input[tid]\n",
126+
" bin_index = bin_index(elem[0])\n",
127+
" #_ = Atomic.fetch_add[ordering= Consistency.MONOTONIC](output + bin_index, 1)\n",
128+
" _ = Atomic.fetch_add(output + bin_index, 1)\n",
129+
"\n",
130+
"\n",
131+
"# Initialize the input buffer with random values between MIN_ELEM and MAX_ELEM\n",
132+
"fn fill_buffer(buffer: HostBuffer[dtype]):\n",
133+
" # Randomize\n",
134+
" random.seed()\n",
135+
" for i in range(len(buffer)):\n",
136+
" buffer[i] = random.random_ui64(MIN_ELEM, MAX_ELEM).cast[dtype]()[0]\n",
137+
"\n",
138+
"# Find the bin index given a number\n",
139+
"@always_inline\n",
140+
"fn bin_index(elem: Int64) -> Int:\n",
141+
" bin_index = Int((elem - MIN_ELEM) // BIN_WIDTH)\n",
142+
" if bin_index >= NUM_BINS:\n",
143+
" bin_index = NUM_BINS - 1\n",
144+
" elif bin_index < 0:\n",
145+
" bin_index = 0\n",
146+
" return bin_index\n",
147+
"\n",
148+
"\n",
149+
"fn main():\n",
150+
" try:\n",
151+
" ctx = DeviceContext()\n",
152+
"\n",
153+
" elements = ctx.enqueue_create_buffer[dtype](ELEMS_COUNT)\n",
154+
" bins = ctx.enqueue_create_buffer[dtype](NUM_BINS).enqueue_fill(0)\n",
155+
"\n",
156+
" with elements.map_to_host() as host_elements:\n",
157+
" fill_buffer(host_elements)\n",
158+
"\n",
159+
" input_tensor = LayoutTensor[dtype, input_layout, MutableAnyOrigin](elements)\n",
160+
"\n",
161+
" ctx.enqueue_function[histogram](input_tensor, bins.unsafe_ptr(), ELEMS_COUNT,\n",
162+
" grid_dim=BLOCKS, block_dim=THREADS\n",
163+
" )\n",
164+
"\n",
165+
" ctx.synchronize()\n",
166+
"\n",
167+
" with bins.map_to_host() as bins_host:\n",
168+
" print(bins_host)\n",
169+
"\n",
170+
"\n",
171+
" print(ctx.name())\n",
172+
" except e:\n",
173+
" print(\"Prininting here: \", e)"
174+
],
175+
"metadata": {
176+
"trusted": true,
177+
"execution": {
178+
"iopub.status.busy": "2025-05-16T16:20:06.536260Z",
179+
"iopub.execute_input": "2025-05-16T16:20:06.536552Z",
180+
"iopub.status.idle": "2025-05-16T16:20:06.542802Z",
181+
"shell.execute_reply.started": "2025-05-16T16:20:06.536524Z",
182+
"shell.execute_reply": "2025-05-16T16:20:06.542252Z"
183+
},
184+
"id": "JRUruBFe56-n",
185+
"outputId": "91dee3ef-e61e-49be-e80d-123a4ec0c8ca"
186+
},
187+
"outputs": [
188+
{
189+
"name": "stdout",
190+
"text": "Overwriting histogram.mojo\n",
191+
"output_type": "stream"
192+
}
193+
],
194+
"execution_count": null
195+
},
196+
{
197+
"cell_type": "code",
198+
"source": [
199+
"!magic run mojo histogram.mojo"
200+
],
201+
"metadata": {
202+
"trusted": true,
203+
"execution": {
204+
"iopub.status.busy": "2025-05-16T16:20:37.738801Z",
205+
"iopub.execute_input": "2025-05-16T16:20:37.739075Z",
206+
"iopub.status.idle": "2025-05-16T16:20:38.890961Z",
207+
"shell.execute_reply.started": "2025-05-16T16:20:37.739049Z",
208+
"shell.execute_reply": "2025-05-16T16:20:38.890297Z"
209+
},
210+
"id": "Pq-Zyr1L56-o",
211+
"outputId": "a678f26f-061a-4dc7-ef59-4d7b136362c5"
212+
},
213+
"outputs": [
214+
{
215+
"name": "stdout",
216+
"text": "\u001b[2K\u001b[32m⠁\u001b[0m activating environment HostBuffer([103435, 103603, 104026, 103629, 103548, 104964, 103544, 103254, 103962, 114611])\nTesla T4\n",
217+
"output_type": "stream"
218+
}
219+
],
220+
"execution_count": null
221+
},
222+
{
223+
"cell_type": "code",
224+
"source": [
225+
"!magic run mojo format histogram.mojo"
226+
],
227+
"metadata": {
228+
"trusted": true,
229+
"id": "qV8bAVgB56-o"
230+
},
231+
"outputs": [],
232+
"execution_count": null
233+
},
234+
{
235+
"cell_type": "code",
236+
"source": [
237+
"!cat histogram.mojo"
238+
],
239+
"metadata": {
240+
"trusted": true,
241+
"id": "Os7BbtUC56-p"
242+
},
243+
"outputs": [],
244+
"execution_count": null
245+
}
246+
]
247+
}

mojo_kernels/histogram.mojo

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
### Histogram
2+
### Program to compute histogram of a 1D array
3+
4+
from gpu.host import DeviceContext, HostBuffer, DeviceBuffer
5+
from gpu import thread_idx, block_idx, block_dim
6+
import random
7+
from math import ceildiv
8+
from memory import UnsafePointer
9+
from layout import Layout, LayoutTensor
10+
from os import Atomic
11+
from os.atomic import Consistency
12+
13+
alias dtype = DType.int64
14+
# How many numbers to bin? 2 ^ 20 (default)
15+
alias ELEMS_COUNT = 1 << 20
16+
# How many bins?
17+
alias NUM_BINS = 10
18+
# Num threads per block
19+
alias THREADS = 256
20+
# Total number of blocks in the grid
21+
alias BLOCKS = ceildiv(ELEMS_COUNT, THREADS)
22+
23+
# Upper and lower bounds of the values that get binned
24+
alias MAX_ELEM = 101
25+
alias MIN_ELEM = 1
26+
27+
alias BIN_WIDTH = (MAX_ELEM - MIN_ELEM + 1) // NUM_BINS
28+
alias input_layout = Layout.row_major(ELEMS_COUNT)
29+
30+
31+
fn histogram(
    input: LayoutTensor[dtype, input_layout, MutableAnyOrigin],
    output: UnsafePointer[Scalar[dtype]],
    total_elems: Int,
):
    """Kernel body: each thread bins one element of `input` into `output`.

    Args:
        input: 1D tensor holding the values to bin.
        output: Pointer to the bin counters; counter `i` lives at `output + i`.
        total_elems: Number of leading elements of `input` to process.
    """
    # Flat global index of this thread; it doubles as the element index.
    var global_idx = block_idx.x * block_dim.x + thread_idx.x

    # Threads whose index falls past the data have nothing to do.
    if global_idx >= total_elems:
        return

    var value = input[global_idx]
    var target_bin = bin_index(value[0])
    # Several threads may target the same counter, hence the atomic add.
    _ = Atomic.fetch_add(output + target_bin, 1)
43+
44+
45+
# Fill the input buffer with random values drawn between MIN_ELEM and MAX_ELEM
fn fill_buffer(buffer: HostBuffer[dtype]):
    """Overwrite every slot of `buffer` with a random value.

    Args:
        buffer: Host-side buffer to populate; all `len(buffer)` slots are written.
    """
    # Seed the generator (no explicit seed value is supplied).
    random.seed()
    var count = len(buffer)
    for slot in range(count):
        # random_ui64 yields an unsigned value; convert it to the buffer dtype.
        var sample = random.random_ui64(MIN_ELEM, MAX_ELEM)
        buffer[slot] = sample.cast[dtype]()[0]
51+
52+
53+
# Map a value to the index of the bin it belongs to
@always_inline
fn bin_index(elem: Int64) -> Int:
    """Return the bin index for `elem`, clamped to `[0, NUM_BINS - 1]`.

    The raw index is `(elem - MIN_ELEM) // BIN_WIDTH`. Results outside the
    valid range are clamped, so with the constants in this file the value
    MAX_ELEM (raw index 10) is counted in the last bin.

    Args:
        elem: The value to classify.

    Returns:
        An index in `[0, NUM_BINS - 1]`.
    """
    var idx = Int((elem - MIN_ELEM) // BIN_WIDTH)
    # Below the first bin: fold into bin 0.
    if idx < 0:
        return 0
    # Past the last bin: fold into the last one.
    if idx >= NUM_BINS:
        return NUM_BINS - 1
    return idx
62+
63+
64+
fn main():
    """Generate random input, run the histogram kernel, and print the bins.

    Prints the bin counters followed by the device name. Any error raised
    along the way is caught and reported on stdout instead of propagating.
    """
    try:
        ctx = DeviceContext()

        # Buffers for the values to bin and for the zero-filled bin counters.
        elements = ctx.enqueue_create_buffer[dtype](ELEMS_COUNT)
        bins = ctx.enqueue_create_buffer[dtype](NUM_BINS).enqueue_fill(0)

        # Populate the input through a host-side mapping of the buffer.
        with elements.map_to_host() as host_elements:
            fill_buffer(host_elements)

        input_tensor = LayoutTensor[dtype, input_layout, MutableAnyOrigin](
            elements
        )

        # BLOCKS * THREADS covers ELEMS_COUNT, i.e. one thread per element.
        ctx.enqueue_function[histogram](
            input_tensor,
            bins.unsafe_ptr(),
            ELEMS_COUNT,
            grid_dim=BLOCKS,
            block_dim=THREADS,
        )

        # Wait for the enqueued work before reading the counters back.
        ctx.synchronize()

        with bins.map_to_host() as bins_host:
            print(bins_host)

        print(ctx.name())
    except e:
        # print() already separates its arguments with a space.
        print("Histogram failed:", e)

0 commit comments

Comments
 (0)