Commit 3ef12c5

Add LazyUDF example
1 parent a4a7b4c commit 3ef12c5

File tree

1 file changed: +97, -53 lines


doc/getting_started/tutorials/03.lazyarray-udf.ipynb

Lines changed: 97 additions & 53 deletions
@@ -11,23 +11,23 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {
-    "is_executing": true,
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:00.645630Z",
      "start_time": "2025-08-04T11:50:59.815878Z"
-    }
+    },
+    "is_executing": true
    },
+   "outputs": [],
    "source": [
     "import time\n",
     "\n",
     "import numba as nb\n",
     "import numpy as np\n",
     "\n",
     "import blosc2"
-   ],
-   "outputs": [],
-   "execution_count": 1
+   ]
   },
   {
    "cell_type": "markdown",
@@ -39,20 +39,20 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.004841Z",
      "start_time": "2025-08-04T11:51:00.653637Z"
     }
    },
+   "outputs": [],
    "source": [
     "shape = (5_000, 2_000)\n",
     "a = np.linspace(0, 1, np.prod(shape), dtype=np.int32).reshape(shape)\n",
     "b = blosc2.arange(np.prod(shape), dtype=np.float32, shape=shape)\n",
     "s = 2.1 # a regular scalar"
-   ],
-   "outputs": [],
-   "execution_count": 2
+   ]
   },
   {
    "cell_type": "markdown",
@@ -65,19 +65,19 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.101741Z",
      "start_time": "2025-08-04T11:51:01.097265Z"
     }
    },
+   "outputs": [],
    "source": [
     "def myudf(inputs_tuple, output, offset):\n",
     "    x, y, s = inputs_tuple # at this point, all are either numpy arrays or scalars\n",
     "    output[:] = x**3 + np.sin(y) + s + 1"
-   ],
-   "outputs": [],
-   "execution_count": 3
+   ]
   },
   {
    "cell_type": "markdown",
@@ -90,16 +90,13 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.154177Z",
      "start_time": "2025-08-04T11:51:01.126220Z"
     }
    },
-   "source": [
-    "larray = blosc2.lazyudf(myudf, (a, b, s), a.dtype)\n",
-    "print(f\"Type: {type(larray)}\")"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -109,27 +106,27 @@
      ]
     }
    ],
-   "execution_count": 4
+   "source": [
+    "larray = blosc2.lazyudf(myudf, (a, b, s), a.dtype)\n",
+    "print(f\"Type: {type(larray)}\")"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "Since the ``LazyUDF`` object implements the same ``LazyArray`` interface as ``LazyExpr``, we may execute and get the result of the function via either of the `__getitem__` (returning a NumPy array) and `compute` (returning a NDArray array) methods. Let's see `__getitem__` first, computing either a slice or the whole result:"
+   "source": [
+    "Since the ``LazyUDF`` object implements the same ``LazyArray`` interface as ``LazyExpr``, we may execute and get the result of the function via either of the `__getitem__` (returning a NumPy array) and `compute` (returning a NDArray array) methods. Let's see `__getitem__` first, computing either a slice or the whole result:"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.393097Z",
      "start_time": "2025-08-04T11:51:01.164244Z"
     }
    },
-   "source": [
-    "npc = larray[:10] # compute a slice of the result\n",
-    "print(f\"Slice - Type: {type(npc)}, shape: {npc.shape}\")\n",
-    "npc = larray[:] # compute the whole result\n",
-    "print(f\"Full array - Type: {type(npc)}, shape: {npc.shape}\")"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -140,7 +137,12 @@
      ]
     }
    ],
-   "execution_count": 5
+   "source": [
+    "npc = larray[:10] # compute a slice of the result\n",
+    "print(f\"Slice - Type: {type(npc)}, shape: {npc.shape}\")\n",
+    "npc = larray[:] # compute the whole result\n",
+    "print(f\"Full array - Type: {type(npc)}, shape: {npc.shape}\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -152,18 +154,13 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.603539Z",
      "start_time": "2025-08-04T11:51:01.403269Z"
     }
    },
-   "source": [
-    "c = larray.compute(urlpath=\"larray.b2nd\", mode=\"w\")\n",
-    "print(f\"Type: {type(c)}\")\n",
-    "print(c.info)\n",
-    "blosc2.remove_urlpath(\"larray.b2nd\") # clean-up"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -172,21 +169,66 @@
       "Type: <class 'blosc2.ndarray.NDArray'>\n",
       "type : NDArray\n",
       "shape : (5000, 2000)\n",
-      "chunks : (625, 2000)\n",
-      "blocks : (40, 2000)\n",
+      "chunks : (1000, 2000)\n",
+      "blocks : (25, 2000)\n",
       "dtype : int32\n",
-      "cratio : 293.26\n",
-      "cparams : CParams(codec=<Codec.ZSTD: 5>, codec_meta=0, clevel=1, use_dict=False, typesize=4,\n",
-      " : nthreads=10, blocksize=320000, splitmode=<SplitMode.AUTO_SPLIT: 3>,\n",
+      "nbytes : 40000000\n",
+      "cbytes : 75294\n",
+      "cratio : 531.25\n",
+      "cparams : CParams(codec=<Codec.ZSTD: 5>, codec_meta=0, clevel=5, use_dict=False, typesize=4,\n",
+      " : nthreads=28, blocksize=200000, splitmode=<SplitMode.AUTO_SPLIT: 3>,\n",
       " : filters=[<Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>,\n",
       " : <Filter.NOFILTER: 0>, <Filter.NOFILTER: 0>, <Filter.SHUFFLE: 1>], filters_meta=[0, 0,\n",
       " : 0, 0, 0, 0], tuner=<Tuner.STUNE: 0>)\n",
-      "dparams : DParams(nthreads=10)\n",
+      "dparams : DParams(nthreads=28)\n",
       "\n"
      ]
     }
    ],
-   "execution_count": 6
+   "source": [
+    "c = larray.compute(urlpath=\"larray.b2nd\", mode=\"w\")\n",
+    "print(f\"Type: {type(c)}\")\n",
+    "print(c.info)\n",
+    "blosc2.remove_urlpath(\"larray.b2nd\") # clean-up"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Saving to disk\n",
+    "As for ``blosc2.Lazyexpr`` objects, one may save the ``LazyUDF`` to disk (so long as the inputs are also on-disk)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "type : LazyUDF\n",
+      "inputs : {'o0': '<NDArray> (5000, 2000) int32',\n",
+      " 'o1': '<NDArray> (5000, 2000) int32',\n",
+      " 'o2': '<NDArray> (5000, 2000) int32'}\n",
+      "shape : (5000, 2000)\n",
+      "dtype : int32\n",
+      "\n",
+      "Result shape: (5000, 2000)\n"
+     ]
+    }
+   ],
+   "source": [
+    "arr = blosc2.asarray(a, urlpath=\"arr.b2nd\", mode=\"w\")\n",
+    "c = blosc2.lazyudf(myudf, (arr, arr, arr), arr.dtype)\n",
+    "c.save(urlpath=\"udf.b2nd\")\n",
+    "c2 = blosc2.open(\"udf.b2nd\")\n",
+    "print(c2.info)\n",
+    "d2 = c2.compute()\n",
+    "print(f\"Result shape: {d2.shape}\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -198,12 +240,14 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:01.684087Z",
      "start_time": "2025-08-04T11:51:01.620200Z"
     }
    },
+   "outputs": [],
    "source": [
     "@nb.jit(nopython=True, parallel=True)\n",
     "def myudf_numba(inputs_tuple, output, offset):\n",
@@ -212,33 +256,24 @@
     "\n",
     "\n",
     "larray_nb = blosc2.lazyudf(myudf_numba, (a, b, s), a.dtype)"
-   ],
-   "outputs": [],
-   "execution_count": 7
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "We then use the ``lazyudf`` constructor as before. Cool! Now, let's evaluate it and compare timings with the pure Python version."
+   "source": [
+    "We then use the ``lazyudf`` constructor as before. Cool! Now, let's evaluate it and compare timings with the pure Python version."
+   ]
  },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-08-04T11:51:06.808378Z",
      "start_time": "2025-08-04T11:51:01.697185Z"
     }
    },
-   "source": [
-    "t1 = time.time()\n",
-    "npc_nb = larray_nb[:] # numba version\n",
-    "t_nb = time.time() - t1\n",
-    "\n",
-    "t1 = time.time()\n",
-    "npc = larray[:] # pure python version\n",
-    "t_ = time.time() - t1\n",
-    "print(f\"Numba: {t_nb:.3f} seconds, pure Python: {t_:.3f} seconds\")"
-   ],
    "outputs": [
     {
      "name": "stdout",
@@ -248,7 +283,16 @@
      ]
     }
    ],
-   "execution_count": 8
+   "source": [
+    "t1 = time.time()\n",
+    "npc_nb = larray_nb[:] # numba version\n",
+    "t_nb = time.time() - t1\n",
+    "\n",
+    "t1 = time.time()\n",
+    "npc = larray[:] # pure python version\n",
+    "t_ = time.time() - t1\n",
+    "print(f\"Numba: {t_nb:.3f} seconds, pure Python: {t_:.3f} seconds\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -284,7 +328,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.13.7"
   }
  },
 "nbformat": 4,
