diff --git a/changes/3573.feature.md b/changes/3573.feature.md
new file mode 100644
index 0000000000..cc06a2a4fe
--- /dev/null
+++ b/changes/3573.feature.md
@@ -0,0 +1 @@
+Added new ``Array.chunk_slices`` and ``Array.shard_slices`` properties to get slices aligned with array chunks and shards, respectively.
diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md
index 25a1347fe3..3c4c1912da 100644
--- a/docs/user-guide/arrays.md
+++ b/docs/user-guide/arrays.md
@@ -566,6 +566,26 @@ In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is
 This means that `10*10` chunks are stored in each shard, and there are `10*10` shards in total.
 Without the `shards` argument, there would be 10,000 chunks stored as individual files.
 
+## Accessing chunks and shards
+
+Arrays have `chunk_slices` and `shard_slices` properties that yield slices aligned to the array's chunks and shards.
+These slices can be used, for example, to write to shards in parallel or to read from chunks in parallel.
+
+```python exec="true" session="arrays" source="above" result="ansi"
+a = zarr.create_array(store={}, shape=(100, 50), shards=(50, 40), chunks=(25, 20), dtype='uint8')
+
+print("All shard slices:")
+for shard_slice in a.shard_slices:
+    print(shard_slice)
+    # shard_data = a[shard_slice]
+
+print("All chunk slices:")
+for chunk_slice in a.chunk_slices:
+    print(chunk_slice)
+    # chunk_data = a[chunk_slice]
+```
+
+
 ## Missing features in 3.0
 
 The following features have not been ported to 3.0 yet.
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 8bd8be40b2..2bbcfafd69 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -3,7 +3,7 @@
 import json
 import warnings
 from asyncio import gather
-from collections.abc import Iterable, Mapping
+from collections.abc import Generator, Iterable, Mapping
 from dataclasses import dataclass, field, replace
 from itertools import starmap
 from logging import getLogger
@@ -1381,6 +1381,32 @@ async def example():
     async def nbytes_stored(self) -> int:
         return await self.store_path.store.getsize_prefix(self.store_path.path)
 
+    @property
+    def chunk_slices(self) -> Generator[tuple[slice, ...]]:
+        """
+        Iterate over the slices that select each chunk of this array.
+
+        Yields
+        ------
+        chunk_slice :
+            Tuple of slices selecting one chunk of this array.
+        """
+        yield from self._iter_chunk_regions()
+
+    @property
+    def shard_slices(self) -> Generator[tuple[slice, ...]]:
+        """
+        Iterate over the slices that select each shard of this array.
+
+        This can be used to loop through and index every shard of an array.
+
+        Yields
+        ------
+        shard_slice :
+            Tuple of slices selecting one shard of this array.
+        """
+        yield from self._iter_shard_regions()
+
     def _iter_chunk_coords(
         self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None
     ) -> Iterator[tuple[int, ...]]:
@@ -2355,6 +2381,34 @@ def shards(self) -> tuple[int, ...] | None:
         """
         return self._async_array.shards
 
+    @property
+    def chunk_slices(self) -> Generator[tuple[slice, ...]]:
+        """
+        Iterate over the slices that select each chunk of this array.
+
+        This can be used to loop through and index every chunk of an array.
+
+        Yields
+        ------
+        chunk_slice :
+            Tuple of slices selecting one chunk of this array.
+        """
+        yield from self._async_array.chunk_slices
+
+    @property
+    def shard_slices(self) -> Generator[tuple[slice, ...]]:
+        """
+        Iterate over the slices that select each shard of this array.
+
+        This can be used to loop through and index every shard of an array.
+
+        Yields
+        ------
+        shard_slice :
+            Tuple of slices selecting one shard of this array.
+        """
+        yield from self._async_array.shard_slices
+
     @property
     def size(self) -> int:
         """Returns the total number of elements in the array.
diff --git a/tests/test_array.py b/tests/test_array.py
index 5219616739..c9d443c537 100644
--- a/tests/test_array.py
+++ b/tests/test_array.py
@@ -2153,3 +2153,24 @@ def test_create_array_with_data_num_gets(
     # one get for the metadata and one per shard.
     # Note: we don't actually need one get per shard, but this is the current behavior
     assert store.counter["get"] == 1 + num_shards
+
+
+@pytest.mark.parametrize("shards", [None, (4, 6)])
+def test_chunk_slices(shards: None | tuple[int, ...]) -> None:
+    arr = zarr.create_array(store={}, shape=(4, 8), dtype="uint8", chunks=(2, 3), shards=shards)
+    assert list(arr.chunk_slices) == [
+        (slice(0, 2, 1), slice(0, 3, 1)),
+        (slice(0, 2, 1), slice(3, 6, 1)),
+        (slice(0, 2, 1), slice(6, 8, 1)),
+        (slice(2, 4, 1), slice(0, 3, 1)),
+        (slice(2, 4, 1), slice(3, 6, 1)),
+        (slice(2, 4, 1), slice(6, 8, 1)),
+    ]
+
+
+def test_shard_slices() -> None:
+    arr = zarr.create_array(store={}, shape=(4, 8), dtype="uint8", chunks=(2, 3), shards=(4, 6))
+    assert list(arr.shard_slices) == [
+        (slice(0, 4, 1), slice(0, 6, 1)),
+        (slice(0, 4, 1), slice(6, 8, 1)),
+    ]