|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
| 14 | + "execution_count": 1, |
14 | 15 | "metadata": { |
15 | | - "is_executing": true, |
16 | 16 | "ExecuteTime": { |
17 | 17 | "end_time": "2025-08-04T11:51:00.645630Z", |
18 | 18 | "start_time": "2025-08-04T11:50:59.815878Z" |
19 | | - } |
| 19 | + }, |
| 20 | + "is_executing": true |
20 | 21 | }, |
| 22 | + "outputs": [], |
21 | 23 | "source": [ |
22 | 24 | "import time\n", |
23 | 25 | "\n", |
24 | 26 | "import numba as nb\n", |
25 | 27 | "import numpy as np\n", |
26 | 28 | "\n", |
27 | 29 | "import blosc2" |
28 | | - ], |
29 | | - "outputs": [], |
30 | | - "execution_count": 1 |
| 30 | + ] |
31 | 31 | }, |
32 | 32 | { |
33 | 33 | "cell_type": "markdown", |
|
39 | 39 | }, |
40 | 40 | { |
41 | 41 | "cell_type": "code", |
| 42 | + "execution_count": 2, |
42 | 43 | "metadata": { |
43 | 44 | "ExecuteTime": { |
44 | 45 | "end_time": "2025-08-04T11:51:01.004841Z", |
45 | 46 | "start_time": "2025-08-04T11:51:00.653637Z" |
46 | 47 | } |
47 | 48 | }, |
| 49 | + "outputs": [], |
48 | 50 | "source": [ |
49 | 51 | "shape = (5_000, 2_000)\n", |
50 | 52 | "a = np.linspace(0, 1, np.prod(shape), dtype=np.int32).reshape(shape)\n", |
51 | 53 | "b = blosc2.arange(np.prod(shape), dtype=np.float32, shape=shape)\n", |
52 | 54 | "s = 2.1 # a regular scalar" |
53 | | - ], |
54 | | - "outputs": [], |
55 | | - "execution_count": 2 |
| 55 | + ] |
56 | 56 | }, |
57 | 57 | { |
58 | 58 | "cell_type": "markdown", |
|
65 | 65 | }, |
66 | 66 | { |
67 | 67 | "cell_type": "code", |
| 68 | + "execution_count": 3, |
68 | 69 | "metadata": { |
69 | 70 | "ExecuteTime": { |
70 | 71 | "end_time": "2025-08-04T11:51:01.101741Z", |
71 | 72 | "start_time": "2025-08-04T11:51:01.097265Z" |
72 | 73 | } |
73 | 74 | }, |
| 75 | + "outputs": [], |
74 | 76 | "source": [ |
75 | 77 | "def myudf(inputs_tuple, output, offset):\n", |
76 | 78 | " x, y, s = inputs_tuple # at this point, all are either numpy arrays or scalars\n", |
77 | 79 | " output[:] = x**3 + np.sin(y) + s + 1" |
78 | | - ], |
79 | | - "outputs": [], |
80 | | - "execution_count": 3 |
| 80 | + ] |
81 | 81 | }, |
82 | 82 | { |
83 | 83 | "cell_type": "markdown", |
|
90 | 90 | }, |
91 | 91 | { |
92 | 92 | "cell_type": "code", |
| 93 | + "execution_count": 4, |
93 | 94 | "metadata": { |
94 | 95 | "ExecuteTime": { |
95 | 96 | "end_time": "2025-08-04T11:51:01.154177Z", |
96 | 97 | "start_time": "2025-08-04T11:51:01.126220Z" |
97 | 98 | } |
98 | 99 | }, |
99 | | - "source": [ |
100 | | - "larray = blosc2.lazyudf(myudf, (a, b, s), a.dtype)\n", |
101 | | - "print(f\"Type: {type(larray)}\")" |
102 | | - ], |
103 | 100 | "outputs": [ |
104 | 101 | { |
105 | 102 | "name": "stdout", |
|
109 | 106 | ] |
110 | 107 | } |
111 | 108 | ], |
112 | | - "execution_count": 4 |
| 109 | + "source": [ |
| 110 | + "larray = blosc2.lazyudf(myudf, (a, b, s), a.dtype)\n", |
| 111 | + "print(f\"Type: {type(larray)}\")" |
| 112 | + ] |
113 | 113 | }, |
114 | 114 | { |
115 | 115 | "cell_type": "markdown", |
116 | 116 | "metadata": {}, |
117 | | - "source": "Since the ``LazyUDF`` object implements the same ``LazyArray`` interface as ``LazyExpr``, we may execute and get the result of the function via either of the `__getitem__` (returning a NumPy array) and `compute` (returning a NDArray array) methods. Let's see `__getitem__` first, computing either a slice or the whole result:" |
| 117 | + "source": [ |
| 118 | + "Since the ``LazyUDF`` object implements the same ``LazyArray`` interface as ``LazyExpr``, we may execute and get the result of the function via either of the `__getitem__` (returning a NumPy array) and `compute` (returning a NDArray array) methods. Let's see `__getitem__` first, computing either a slice or the whole result:" |
| 119 | + ] |
118 | 120 | }, |
119 | 121 | { |
120 | 122 | "cell_type": "code", |
| 123 | + "execution_count": 5, |
121 | 124 | "metadata": { |
122 | 125 | "ExecuteTime": { |
123 | 126 | "end_time": "2025-08-04T11:51:01.393097Z", |
124 | 127 | "start_time": "2025-08-04T11:51:01.164244Z" |
125 | 128 | } |
126 | 129 | }, |
127 | | - "source": [ |
128 | | - "npc = larray[:10] # compute a slice of the result\n", |
129 | | - "print(f\"Slice - Type: {type(npc)}, shape: {npc.shape}\")\n", |
130 | | - "npc = larray[:] # compute the whole result\n", |
131 | | - "print(f\"Full array - Type: {type(npc)}, shape: {npc.shape}\")" |
132 | | - ], |
133 | 130 | "outputs": [ |
134 | 131 | { |
135 | 132 | "name": "stdout", |
|
140 | 137 | ] |
141 | 138 | } |
142 | 139 | ], |
143 | | - "execution_count": 5 |
| 140 | + "source": [ |
| 141 | + "npc = larray[:10] # compute a slice of the result\n", |
| 142 | + "print(f\"Slice - Type: {type(npc)}, shape: {npc.shape}\")\n", |
| 143 | + "npc = larray[:] # compute the whole result\n", |
| 144 | + "print(f\"Full array - Type: {type(npc)}, shape: {npc.shape}\")" |
| 145 | + ] |
144 | 146 | }, |
145 | 147 | { |
146 | 148 | "cell_type": "markdown", |
|
152 | 154 | }, |
153 | 155 | { |
154 | 156 | "cell_type": "code", |
| 157 | + "execution_count": 6, |
155 | 158 | "metadata": { |
156 | 159 | "ExecuteTime": { |
157 | 160 | "end_time": "2025-08-04T11:51:01.603539Z", |
158 | 161 | "start_time": "2025-08-04T11:51:01.403269Z" |
159 | 162 | } |
160 | 163 | }, |
161 | | - "source": [ |
162 | | - "c = larray.compute(urlpath=\"larray.b2nd\", mode=\"w\")\n", |
163 | | - "print(f\"Type: {type(c)}\")\n", |
164 | | - "print(c.info)\n", |
165 | | - "blosc2.remove_urlpath(\"larray.b2nd\") # clean-up" |
166 | | - ], |
167 | 164 | "outputs": [ |
168 | 165 | { |
169 | 166 | "name": "stdout", |
|
172 | 169 | "Type: <class 'blosc2.ndarray.NDArray'>\n", |
173 | 170 | "type : NDArray\n", |
174 | 171 | "shape : (5000, 2000)\n", |
175 | | - "chunks : (625, 2000)\n", |
176 | | - "blocks : (40, 2000)\n", |
| 172 | + "chunks : (1000, 2000)\n", |
| 173 | + "blocks : (25, 2000)\n", |
177 | 174 | "dtype : int32\n", |
178 | | - "cratio : 293.26\n", |
179 | | - "cparams : CParams(codec=<Codec.ZSTD: 5>, codec_meta=0, clevel=1, use_dict=False, typesize=4,\n", |
180 | | - " : nthreads=10, blocksize=320000, splitmode=<SplitMode.AUTO_SPLIT: 3>,\n", |
| 175 | + "nbytes : 40000000\n", |
| 176 | + "cbytes : 75294\n", |
| 177 | + "cratio : 531.25\n", |
| 178 | + "cparams : CParams(codec=<Codec.ZSTD: 5>, codec_meta=0, clevel=5, use_dict=False, typesize=4,\n", |
| 179 | + " : nthreads=28, blocksize=200000, splitmode=<SplitMode.AUTO_SPLIT: 3>,\n", |
181 | 180 | " : filters=[<Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>,\n", |
182 | 181 | " : <Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>, <Filter.SHUFFLE: 1>], filters_meta=[0, 0,\n", |
183 | 182 | " : 0, 0, 0, 0], tuner=<Tuner.STUNE: 0>)\n", |
184 | | - "dparams : DParams(nthreads=10)\n", |
| 183 | + "dparams : DParams(nthreads=28)\n", |
185 | 184 | "\n" |
186 | 185 | ] |
187 | 186 | } |
188 | 187 | ], |
189 | | - "execution_count": 6 |
| 188 | + "source": [ |
| 189 | + "c = larray.compute(urlpath=\"larray.b2nd\", mode=\"w\")\n", |
| 190 | + "print(f\"Type: {type(c)}\")\n", |
| 191 | + "print(c.info)\n", |
| 192 | + "blosc2.remove_urlpath(\"larray.b2nd\") # clean-up" |
| 193 | + ] |
| 194 | + }, |
| 195 | + { |
| 196 | + "cell_type": "markdown", |
| 197 | + "metadata": {}, |
| 198 | + "source": [ |
| 199 | + "### Saving to disk\n", |
| 200 | + "As for ``blosc2.Lazyexpr`` objects, one may save the ``LazyUDF`` to disk (so long as the inputs are also on-disk)." |
| 201 | + ] |
| 202 | + }, |
| 203 | + { |
| 204 | + "cell_type": "code", |
| 205 | + "execution_count": 9, |
| 206 | + "metadata": {}, |
| 207 | + "outputs": [ |
| 208 | + { |
| 209 | + "name": "stdout", |
| 210 | + "output_type": "stream", |
| 211 | + "text": [ |
| 212 | + "type : LazyUDF\n", |
| 213 | + "inputs : {'o0': '<NDArray> (5000, 2000) int32',\n", |
| 214 | + " 'o1': '<NDArray> (5000, 2000) int32',\n", |
| 215 | + " 'o2': '<NDArray> (5000, 2000) int32'}\n", |
| 216 | + "shape : (5000, 2000)\n", |
| 217 | + "dtype : int32\n", |
| 218 | + "\n", |
| 219 | + "Result shape: (5000, 2000)\n" |
| 220 | + ] |
| 221 | + } |
| 222 | + ], |
| 223 | + "source": [ |
| 224 | + "arr = blosc2.asarray(a, urlpath=\"arr.b2nd\", mode=\"w\")\n", |
| 225 | + "c = blosc2.lazyudf(myudf, (arr, arr, arr), arr.dtype)\n", |
| 226 | + "c.save(urlpath=\"udf.b2nd\")\n", |
| 227 | + "c2 = blosc2.open(\"udf.b2nd\")\n", |
| 228 | + "print(c2.info)\n", |
| 229 | + "d2 = c2.compute()\n", |
| 230 | + "print(f\"Result shape: {d2.shape}\")" |
| 231 | + ] |
190 | 232 | }, |
191 | 233 | { |
192 | 234 | "cell_type": "markdown", |
|
198 | 240 | }, |
199 | 241 | { |
200 | 242 | "cell_type": "code", |
| 243 | + "execution_count": 7, |
201 | 244 | "metadata": { |
202 | 245 | "ExecuteTime": { |
203 | 246 | "end_time": "2025-08-04T11:51:01.684087Z", |
204 | 247 | "start_time": "2025-08-04T11:51:01.620200Z" |
205 | 248 | } |
206 | 249 | }, |
| 250 | + "outputs": [], |
207 | 251 | "source": [ |
208 | 252 | "@nb.jit(nopython=True, parallel=True)\n", |
209 | 253 | "def myudf_numba(inputs_tuple, output, offset):\n", |
|
212 | 256 | "\n", |
213 | 257 | "\n", |
214 | 258 | "larray_nb = blosc2.lazyudf(myudf_numba, (a, b, s), a.dtype)" |
215 | | - ], |
216 | | - "outputs": [], |
217 | | - "execution_count": 7 |
| 259 | + ] |
218 | 260 | }, |
219 | 261 | { |
220 | 262 | "cell_type": "markdown", |
221 | 263 | "metadata": {}, |
222 | | - "source": "We then use the ``lazyudf`` constructor as before. Cool! Now, let's evaluate it and compare timings with the pure Python version." |
| 264 | + "source": [ |
| 265 | + "We then use the ``lazyudf`` constructor as before. Cool! Now, let's evaluate it and compare timings with the pure Python version." |
| 266 | + ] |
223 | 267 | }, |
224 | 268 | { |
225 | 269 | "cell_type": "code", |
| 270 | + "execution_count": 8, |
226 | 271 | "metadata": { |
227 | 272 | "ExecuteTime": { |
228 | 273 | "end_time": "2025-08-04T11:51:06.808378Z", |
229 | 274 | "start_time": "2025-08-04T11:51:01.697185Z" |
230 | 275 | } |
231 | 276 | }, |
232 | | - "source": [ |
233 | | - "t1 = time.time()\n", |
234 | | - "npc_nb = larray_nb[:] # numba version\n", |
235 | | - "t_nb = time.time() - t1\n", |
236 | | - "\n", |
237 | | - "t1 = time.time()\n", |
238 | | - "npc = larray[:] # pure python version\n", |
239 | | - "t_ = time.time() - t1\n", |
240 | | - "print(f\"Numba: {t_nb:.3f} seconds, pure Python: {t_:.3f} seconds\")" |
241 | | - ], |
242 | 277 | "outputs": [ |
243 | 278 | { |
244 | 279 | "name": "stdout", |
|
248 | 283 | ] |
249 | 284 | } |
250 | 285 | ], |
251 | | - "execution_count": 8 |
| 286 | + "source": [ |
| 287 | + "t1 = time.time()\n", |
| 288 | + "npc_nb = larray_nb[:] # numba version\n", |
| 289 | + "t_nb = time.time() - t1\n", |
| 290 | + "\n", |
| 291 | + "t1 = time.time()\n", |
| 292 | + "npc = larray[:] # pure python version\n", |
| 293 | + "t_ = time.time() - t1\n", |
| 294 | + "print(f\"Numba: {t_nb:.3f} seconds, pure Python: {t_:.3f} seconds\")" |
| 295 | + ] |
252 | 296 | }, |
253 | 297 | { |
254 | 298 | "cell_type": "markdown", |
|
284 | 328 | "name": "python", |
285 | 329 | "nbconvert_exporter": "python", |
286 | 330 | "pygments_lexer": "ipython3", |
287 | | - "version": "3.12.7" |
| 331 | + "version": "3.13.7" |
288 | 332 | } |
289 | 333 | }, |
290 | 334 | "nbformat": 4, |
|