Commit 3ef12c5

Add LazyUDF example
1 parent a4a7b4c commit 3ef12c5

File tree

1 file changed: +97, -53 lines


doc/getting_started/tutorials/03.lazyarray-udf.ipynb

Lines changed: 97 additions & 53 deletions
@@ -11,23 +11,23 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {
-    "is_executing": true,
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:00.645630Z",
      "start_time": "2025-08-04T11:50:59.815878Z"
-    }
+    },
+    "is_executing": true
    },
+   "outputs": [],
    "source": [
     "import time\n",
     "\n",
     "import numba as nb\n",
     "import numpy as np\n",
     "\n",
     "import blosc2"
-   ],
-   "outputs": [],
-   "execution_count": 1
+   ]
   },
   {
    "cell_type": "markdown",
@@ -39,20 +39,20 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.004841Z",
      "start_time": "2025-08-04T11:51:00.653637Z"
     }
    },
+   "outputs": [],
    "source": [
     "shape = (5_000, 2_000)\n",
     "a = np.linspace(0, 1, np.prod(shape), dtype=np.int32).reshape(shape)\n",
     "b = blosc2.arange(np.prod(shape), dtype=np.float32, shape=shape)\n",
     "s = 2.1 # a regular scalar"
-   ],
-   "outputs": [],
-   "execution_count": 2
+   ]
   },
   {
    "cell_type": "markdown",
@@ -65,19 +65,19 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.101741Z",
      "start_time": "2025-08-04T11:51:01.097265Z"
     }
    },
+   "outputs": [],
    "source": [
     "def myudf(inputs_tuple, output, offset):\n",
     "    x, y, s = inputs_tuple # at this point, all are either numpy arrays or scalars\n",
     "    output[:] = x**3 + np.sin(y) + s + 1"
-   ],
-   "outputs": [],
-   "execution_count": 3
+   ]
   },
   {
    "cell_type": "markdown",
@@ -90,16 +90,13 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.154177Z",
      "start_time": "2025-08-04T11:51:01.126220Z"
     }
    },
-   "source": [
-    "larray = blosc2.lazyudf(myudf, (a, b, s), a.dtype)\n",
-    "print(f\"Type: {type(larray)}\")"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -109,27 +106,27 @@
      ]
     }
    ],
-   "execution_count": 4
+   "source": [
+    "larray = blosc2.lazyudf(myudf, (a, b, s), a.dtype)\n",
+    "print(f\"Type: {type(larray)}\")"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "Since the ``LazyUDF`` object implements the same ``LazyArray`` interface as ``LazyExpr``, we may execute and get the result of the function via either of the `__getitem__` (returning a NumPy array) and `compute` (returning a NDArray array) methods. Let's see `__getitem__` first, computing either a slice or the whole result:"
+   "source": [
+    "Since the ``LazyUDF`` object implements the same ``LazyArray`` interface as ``LazyExpr``, we may execute and get the result of the function via either of the `__getitem__` (returning a NumPy array) and `compute` (returning a NDArray array) methods. Let's see `__getitem__` first, computing either a slice or the whole result:"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.393097Z",
      "start_time": "2025-08-04T11:51:01.164244Z"
     }
    },
-   "source": [
-    "npc = larray[:10] # compute a slice of the result\n",
-    "print(f\"Slice - Type: {type(npc)}, shape: {npc.shape}\")\n",
-    "npc = larray[:] # compute the whole result\n",
-    "print(f\"Full array - Type: {type(npc)}, shape: {npc.shape}\")"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -140,7 +137,12 @@
      ]
     }
    ],
-   "execution_count": 5
+   "source": [
+    "npc = larray[:10] # compute a slice of the result\n",
+    "print(f\"Slice - Type: {type(npc)}, shape: {npc.shape}\")\n",
+    "npc = larray[:] # compute the whole result\n",
+    "print(f\"Full array - Type: {type(npc)}, shape: {npc.shape}\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -152,18 +154,13 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.603539Z",
      "start_time": "2025-08-04T11:51:01.403269Z"
     }
    },
-   "source": [
-    "c = larray.compute(urlpath=\"larray.b2nd\", mode=\"w\")\n",
-    "print(f\"Type: {type(c)}\")\n",
-    "print(c.info)\n",
-    "blosc2.remove_urlpath(\"larray.b2nd\") # clean-up"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -172,21 +169,66 @@
       "Type: <class 'blosc2.ndarray.NDArray'>\n",
       "type : NDArray\n",
       "shape : (5000, 2000)\n",
-      "chunks : (625, 2000)\n",
-      "blocks : (40, 2000)\n",
+      "chunks : (1000, 2000)\n",
+      "blocks : (25, 2000)\n",
       "dtype : int32\n",
-      "cratio : 293.26\n",
-      "cparams : CParams(codec=<Codec.ZSTD: 5>, codec_meta=0, clevel=1, use_dict=False, typesize=4,\n",
-      " : nthreads=10, blocksize=320000, splitmode=<SplitMode.AUTO_SPLIT: 3>,\n",
+      "nbytes : 40000000\n",
+      "cbytes : 75294\n",
+      "cratio : 531.25\n",
+      "cparams : CParams(codec=<Codec.ZSTD: 5>, codec_meta=0, clevel=5, use_dict=False, typesize=4,\n",
+      " : nthreads=28, blocksize=200000, splitmode=<SplitMode.AUTO_SPLIT: 3>,\n",
       " : filters=[<Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>,\n",
       " : <Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>, <Filter.SHUFFLE: 1>], filters_meta=[0, 0,\n",
       " : 0, 0, 0, 0], tuner=<Tuner.STUNE: 0>)\n",
-      "dparams : DParams(nthreads=10)\n",
+      "dparams : DParams(nthreads=28)\n",
       "\n"
      ]
     }
    ],
-   "execution_count": 6
+   "source": [
+    "c = larray.compute(urlpath=\"larray.b2nd\", mode=\"w\")\n",
+    "print(f\"Type: {type(c)}\")\n",
+    "print(c.info)\n",
+    "blosc2.remove_urlpath(\"larray.b2nd\") # clean-up"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Saving to disk\n",
+    "As for ``blosc2.Lazyexpr`` objects, one may save the ``LazyUDF`` to disk (so long as the inputs are also on-disk)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "type : LazyUDF\n",
+      "inputs : {'o0': '<NDArray> (5000, 2000) int32',\n",
+      " 'o1': '<NDArray> (5000, 2000) int32',\n",
+      " 'o2': '<NDArray> (5000, 2000) int32'}\n",
+      "shape : (5000, 2000)\n",
+      "dtype : int32\n",
+      "\n",
+      "Result shape: (5000, 2000)\n"
+     ]
+    }
+   ],
+   "source": [
+    "arr = blosc2.asarray(a, urlpath=\"arr.b2nd\", mode=\"w\")\n",
+    "c = blosc2.lazyudf(myudf, (arr, arr, arr), arr.dtype)\n",
+    "c.save(urlpath=\"udf.b2nd\")\n",
+    "c2 = blosc2.open(\"udf.b2nd\")\n",
+    "print(c2.info)\n",
+    "d2 = c2.compute()\n",
+    "print(f\"Result shape: {d2.shape}\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -198,12 +240,14 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.684087Z",
      "start_time": "2025-08-04T11:51:01.620200Z"
     }
    },
+   "outputs": [],
    "source": [
     "@nb.jit(nopython=True, parallel=True)\n",
     "def myudf_numba(inputs_tuple, output, offset):\n",
@@ -212,33 +256,24 @@
     "\n",
     "\n",
     "larray_nb = blosc2.lazyudf(myudf_numba, (a, b, s), a.dtype)"
-   ],
-   "outputs": [],
-   "execution_count": 7
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "We then use the ``lazyudf`` constructor as before. Cool! Now, let's evaluate it and compare timings with the pure Python version."
+   "source": [
+    "We then use the ``lazyudf`` constructor as before. Cool! Now, let's evaluate it and compare timings with the pure Python version."
+   ]
  },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:06.808378Z",
      "start_time": "2025-08-04T11:51:01.697185Z"
     }
    },
-   "source": [
-    "t1 = time.time()\n",
-    "npc_nb = larray_nb[:] # numba version\n",
-    "t_nb = time.time() - t1\n",
-    "\n",
-    "t1 = time.time()\n",
-    "npc = larray[:] # pure python version\n",
-    "t_ = time.time() - t1\n",
-    "print(f\"Numba: {t_nb:.3f} seconds, pure Python: {t_:.3f} seconds\")"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -248,7 +283,16 @@
      ]
     }
    ],
-   "execution_count": 8
+   "source": [
+    "t1 = time.time()\n",
+    "npc_nb = larray_nb[:] # numba version\n",
+    "t_nb = time.time() - t1\n",
+    "\n",
+    "t1 = time.time()\n",
+    "npc = larray[:] # pure python version\n",
+    "t_ = time.time() - t1\n",
+    "print(f\"Numba: {t_nb:.3f} seconds, pure Python: {t_:.3f} seconds\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -284,7 +328,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.13.7"
   }
  },
 "nbformat": 4,
