From 0797c15e3a61a9ab28892c29b4c3b9b7a85f52a1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 21:45:27 +0200 Subject: [PATCH 1/9] refactor default chunk encoding to skip config. add tests for deprecated config keys --- src/zarr/core/array.py | 109 +++++++++++++++++++++++++++++++--------- src/zarr/core/config.py | 41 ++++++++------- tests/test_config.py | 80 ++++++++++------------------- tests/test_v2.py | 57 ++++++++++----------- 4 files changed, 159 insertions(+), 128 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 312dc0bc4d..185f72e41a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -30,6 +30,8 @@ from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec +from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes @@ -72,7 +74,7 @@ ZDTypeLike, parse_data_type, ) -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -710,7 +712,10 @@ def _create_metadata_v3( shape = parse_shapelike(shape) if codecs is None: - filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype) + filters = default_filters_v3(dtype) + serializer = default_serializer_v3(dtype) + compressors = default_compressors_v3(dtype) + codecs_parsed = (*filters, serializer, *compressors) else: codecs_parsed = tuple(codecs) @@ -850,10 +855,9 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) compressor_parsed: CompressorLikev2 if compressor == "auto": - compressor_parsed = 
default_compressor + compressor_parsed = default_compressor_v2(dtype) elif isinstance(compressor, BytesBytesCodec): raise ValueError( "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " @@ -863,7 +867,7 @@ async def _create_v2( compressor_parsed = compressor if filters is None: - filters = default_filters + filters = default_filters_v2(dtype) metadata = cls._create_metadata_v2( shape=shape, @@ -4654,19 +4658,80 @@ def _get_default_chunk_encoding_v3( ) -def _get_default_chunk_encoding_v2( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ - Get the default chunk encoding for Zarr format 2 arrays, given a dtype + Given a data type, return the default filters for that data type. + + This is an empty tuple. No data types have default filters. """ - dtype_category = categorize_data_type(dtype) - filters = zarr_config.get("array.v2_default_filters").get(dtype_category) - compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category) - if filters is not None: - filters = tuple(numcodecs.get_codec(f) for f in filters) + return () - return filters, numcodecs.get_codec(compressor) + +def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]: + """ + Given a data type, return the default compressors for that data type. + + This is just a tuple containing ``ZstdCodec`` + """ + return (ZstdCodec(),) + + +def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: + """ + Given a data type, return the default serializer for that data type. + + The default serializer for most data types is the ``BytesCodec``, which may or may not be + parameterized with an endianness, depending on whether the data type has endianness. Variable + length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and + ``VLenBytesCodec``, respectively. 
+ + """ + serializer: ArrayBytesCodec = BytesCodec() + + if isinstance(dtype, HasEndianness): + serializer = BytesCodec(endian="little") + elif isinstance(dtype, HasObjectCodec): + if dtype.object_codec_id == "vlen-bytes": + serializer = VLenBytesCodec() + elif dtype.object_codec_id == "vlen-utf8": + serializer = VLenUTF8Codec() + else: + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}" + raise ValueError(msg) + return serializer + + +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: + """ + Given a data type, return the default filters for that data type. + + For data types that require an object codec, namely variable length data types, + this is a tuple containing the object codec. Otherwise it's ``None``. + """ + if isinstance(dtype, HasObjectCodec): + if dtype.object_codec_id == "vlen-bytes": + from numcodecs import VLenBytes + + return (VLenBytes(),) + elif dtype.object_codec_id == "vlen-utf8": + from numcodecs import VLenUTF8 + + return (VLenUTF8(),) + else: + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}" + raise ValueError(msg) + return None + + +def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: + """ + Given a data type, return the default compressors for that data type. + + This is just the numcodecs ``Zstd`` codec. + """ + from numcodecs import Zstd + + return Zstd(level=0, checksum=False) def _parse_chunk_encoding_v2( @@ -4678,14 +4743,13 @@ def _parse_chunk_encoding_v2( """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ - default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) _filters: tuple[numcodecs.abc.Codec, ...] 
| None _compressor: numcodecs.abc.Codec | None if compressor is None or compressor == (): _compressor = None elif compressor == "auto": - _compressor = default_compressor + _compressor = default_compressor_v2(dtype) elif isinstance(compressor, tuple | list) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: @@ -4697,7 +4761,7 @@ def _parse_chunk_encoding_v2( if filters is None: _filters = None elif filters == "auto": - _filters = default_filters + _filters = default_filters_v2(dtype) else: if isinstance(filters, Iterable): for idx, f in enumerate(filters): @@ -4722,14 +4786,11 @@ def _parse_chunk_encoding_v3( """ Generate chunk encoding classes for v3 arrays with optional defaults. """ - default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3( - dtype - ) if filters is None: out_array_array: tuple[ArrayArrayCodec, ...] = () elif filters == "auto": - out_array_array = default_array_array + out_array_array = default_filters_v3(dtype) else: maybe_array_array: Iterable[Codec | dict[str, JSON]] if isinstance(filters, dict | Codec): @@ -4739,7 +4800,7 @@ def _parse_chunk_encoding_v3( out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) if serializer == "auto": - out_array_bytes = default_array_bytes + out_array_bytes = default_serializer_v3(dtype) else: # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an @@ -4749,7 +4810,7 @@ def _parse_chunk_encoding_v3( if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () elif compressors == "auto": - out_bytes_bytes = default_bytes_bytes + out_bytes_bytes = default_compressors_v3(dtype) else: maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] if isinstance(compressors, dict | Codec): diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 05d048ef74..cc3c33cd17 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -78,6 +78,25 @@ def enable_gpu(self) -> ConfigSet: ) +# these keys were removed from the config as part of the 3.1.0 release. +# these deprecations should be removed in 3.1.1 or thereabouts. +deprecations = { + "array.v2_default_compressor.numeric": None, + "array.v2_default_compressor.string": None, + "array.v2_default_compressor.bytes": None, + "array.v2_default_filters.string": None, + "array.v2_default_filters.bytes": None, + "array.v3_default_filters.numeric": None, + "array.v3_default_filters.raw": None, + "array.v3_default_filters.bytes": None, + "array.v3_default_serializer.numeric": None, + "array.v3_default_serializer.string": None, + "array.v3_default_serializer.bytes": None, + "array.v3_default_compressors.string": None, + "array.v3_default_compressors.bytes": None, + "array.v3_default_compressors": None, +} + # The default configuration for zarr config = Config( "zarr", @@ -87,27 +106,6 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": { - "default": {"id": "zstd", "level": 0, "checksum": False}, - "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, - }, - "v2_default_filters": { - "default": None, - "variable-length-string": [{"id": "vlen-utf8"}], - }, - "v3_default_filters": {"default": [], "variable-length-string": []}, - "v3_default_serializer": { - "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable-length-string": {"name": "vlen-utf8"}, - }, - "v3_default_compressors": { - "default": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": 
False}}, - ], - "variable-length-string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}} - ], - }, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, @@ -132,6 +130,7 @@ def enable_gpu(self) -> ConfigSet: "ndbuffer": "zarr.buffer.cpu.NDBuffer", } ], + deprecations=deprecations, ) diff --git a/tests/test_config.py b/tests/test_config.py index e267601272..c59e721c49 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import os from collections.abc import Iterable -from typing import TYPE_CHECKING, Any +from typing import Any from unittest import mock from unittest.mock import Mock @@ -16,16 +16,13 @@ BloscCodec, BytesCodec, Crc32cCodec, - GzipCodec, ShardingCodec, ) -from zarr.core.array import create_array from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import Int8, VariableLengthUTF8 from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -38,7 +35,6 @@ register_ndbuffer, register_pipeline, ) -from zarr.storage import MemoryStore from zarr.testing.buffer import ( NDBufferUsingTestNDArrayLike, StoreExpectingTestBuffer, @@ -46,9 +42,6 @@ TestNDArrayLike, ) -if TYPE_CHECKING: - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType - def test_config_defaults_set() -> None: # regression test for available defaults @@ -60,27 +53,6 @@ def test_config_defaults_set() -> None: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": { - "default": {"id": "zstd", "level": 0, "checksum": False}, - "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, - }, - "v2_default_filters": { - "default": None, - "variable-length-string": [{"id": "vlen-utf8"}], - }, - "v3_default_filters": {"default": [], 
"variable-length-string": []}, - "v3_default_serializer": { - "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable-length-string": {"name": "vlen-utf8"}, - }, - "v3_default_compressors": { - "default": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "variable-length-string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}} - ], - }, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, @@ -323,29 +295,31 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") -@pytest.mark.parametrize("dtype_category", ["variable-length-string", "default"]) -@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") -async def test_default_codecs(dtype_category: str) -> None: +@pytest.mark.parametrize( + "key", + [ + "array.v2_default_compressor.numeric", + "array.v2_default_compressor.string", + "array.v2_default_compressor.bytes", + "array.v2_default_filters.string", + "array.v2_default_filters.bytes", + "array.v3_default_filters.numeric", + "array.v3_default_filters.raw", + "array.v3_default_filters.bytes", + "array.v3_default_serializer.numeric", + "array.v3_default_serializer.string", + "array.v3_default_serializer.bytes", + "array.v3_default_compressors.string", + "array.v3_default_compressors.bytes", + "array.v3_default_compressors", + ], +) +def test_deprecated_config(key: str) -> None: """ - Test that the default compressors are sensitive to the current setting of the config. 
+ Test that a ValueError is raised when setting the default chunk encoding for a given + data type category """ - zdtype: ZDType[TBaseDType, TBaseScalar] - if dtype_category == "variable-length-string": - zdtype = VariableLengthUTF8() # type: ignore[assignment] - else: - zdtype = Int8() - expected_compressors = (GzipCodec(),) - new_conf = { - f"array.v3_default_compressors.{dtype_category}": [ - c.to_dict() for c in expected_compressors - ] - } - with config.set(new_conf): - arr = await create_array( - shape=(100,), - chunks=(100,), - dtype=zdtype, - zarr_format=3, - store=MemoryStore(), - ) - assert arr.compressors == expected_compressors + + with pytest.raises(ValueError): + with zarr.config.set({key: "foo"}): + pass diff --git a/tests/test_v2.py b/tests/test_v2.py index 29f031663f..4d17305995 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -73,37 +73,34 @@ def test_codec_pipeline() -> None: async def test_v2_encode_decode( dtype: str, expected_dtype: str, fill_value: bytes, fill_value_json: str ) -> None: - with config.set( - { - "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], - "array.v2_default_compressor.bytes": None, - } - ): - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None - ) + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None + ) - result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) - assert result is not None - - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": expected_dtype, - "fill_value": fill_value_json, - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected - - data = 
zarr.open_array(store=store, path="foo")[:] - np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": expected_dtype, + "fill_value": fill_value_json, + "filters": None, + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) + + data = zarr.open_array(store=store, path="foo")[:] + np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) @pytest.mark.parametrize( From 855901ca485279effbe86824f236edfb33fcd064 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 22:03:14 +0200 Subject: [PATCH 2/9] remove chunk encoding configuration from docs --- docs/user-guide/arrays.rst | 10 ---------- docs/user-guide/config.rst | 20 +------------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index f45dfbebe8..67b134d442 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -246,16 +246,6 @@ built-in delta filter:: >>> z.compressors (LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) -The default compressor can be changed by setting the value of the using Zarr's -:ref:`user-guide-config`, e.g.:: - - >>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}): - ... 
z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) - >>> z.filters - () - >>> z.compressors - (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) - To disable compression, set ``compressors=None`` when creating an array, e.g.:: >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 5a9d26f2b9..0ae8017ca9 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -43,25 +43,7 @@ This is the current default configuration:: >>> zarr.config.pprint() {'array': {'order': 'C', - 'v2_default_compressor': {'default': {'checksum': False, - 'id': 'zstd', - 'level': 0}, - 'variable-length-string': {'checksum': False, - 'id': 'zstd', - 'level': 0}}, - 'v2_default_filters': {'default': None, - 'variable-length-string': [{'id': 'vlen-utf8'}]}, - 'v3_default_compressors': {'default': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'variable-length-string': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}]}, - 'v3_default_filters': {'default': [], 'variable-length-string': []}, - 'v3_default_serializer': {'default': {'configuration': {'endian': 'little'}, - 'name': 'bytes'}, - 'variable-length-string': {'name': 'vlen-utf8'}}, - 'write_empty_chunks': False}, + 'write_empty_chunks': False}, 'async': {'concurrency': 10, 'timeout': None}, 'buffer': 'zarr.buffer.cpu.Buffer', 'codec_pipeline': {'batch_size': 1, From 7447805588a8689f2bc43cdd5507acf12c071eea Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 23:33:35 +0200 Subject: [PATCH 3/9] don't create invalid string dtype arrays in test --- tests/test_array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_array.py b/tests/test_array.py index 0bca860e84..da9aa7aa30 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ 
-1335,6 +1335,8 @@ async def test_invalid_v3_arguments( async def test_v2_chunk_encoding( store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str ) -> None: + if dtype == "str" and filters != "auto": + pytest.skip("Only the auto filters are compatible with str dtype in this test.") arr = await create_array( store=store, dtype=dtype, From 173766d1bd248ab5440d2ceaf33588c7e98971d7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 23:34:39 +0200 Subject: [PATCH 4/9] add v2-style error when creating a vlen dtype without the right codec --- src/zarr/core/array.py | 43 ++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 185f72e41a..825741f855 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -70,6 +70,8 @@ from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( + VariableLengthBytes, + VariableLengthUTF8, ZDType, ZDTypeLike, parse_data_type, @@ -111,6 +113,7 @@ ) from zarr.core.metadata.v2 import ( CompressorLikev2, + get_object_codec_id, parse_compressor, parse_filters, ) @@ -4686,7 +4689,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: ``VLenBytesCodec``, respectively. 
""" - serializer: ArrayBytesCodec = BytesCodec() + serializer: ArrayBytesCodec = BytesCodec(endian=None) if isinstance(dtype, HasEndianness): serializer = BytesCodec(endian="little") @@ -4772,7 +4775,33 @@ def _parse_chunk_encoding_v2( ) raise TypeError(msg) _filters = parse_filters(filters) - + if isinstance(dtype, HasObjectCodec): + # check the filters and the compressor for the object codec required for this data type + if _filters is None: + if _compressor is None: + object_codec_id = None + else: + object_codec_id = get_object_codec_id((_compressor.get_config(),)) + else: + object_codec_id = get_object_codec_id( + ( + *[f.get_config() for f in _filters], + _compressor.get_config() if _compressor is not None else None, + ) + ) + if object_codec_id is None: + if isinstance(dtype, VariableLengthUTF8): + codec_name = "the numcodecs.VLenUTF8 codec" + elif isinstance(dtype, VariableLengthBytes): + codec_name = "the numcodecs.VLenBytes codec" + else: + codec_name = "an unknown object codec" + msg = ( + f"Data type {dtype} requires {codec_name}, " + "but no such codec was specified in the filters or compressor parameters for " + "this array. " + ) + raise ValueError(msg) return _filters, _compressor @@ -4820,17 +4849,11 @@ def _parse_chunk_encoding_v3( out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) - # specialize codecs as needed given the dtype - - # TODO: refactor so that the config only contains the name of the codec, and we use the dtype - # to create the codec instance, instead of storing a dict representation of a full codec. - # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. 
- if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): - # The default endianness in the bytescodec might not be None, so we need to replace it - out_array_bytes = replace(out_array_bytes, endian=None) + + # TODO: add checks to ensure that the right serializer is used for vlen data types return out_array_array, out_array_bytes, out_bytes_bytes From f4d31a28875b1511efd356c8b773fe1b2b03812e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 23:46:39 +0200 Subject: [PATCH 5/9] test for v2-style error when creating an object array without an object codec --- src/zarr/core/array.py | 4 +-- tests/test_array.py | 61 +++++++++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 825741f855..fc77567088 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4721,7 +4721,7 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | return (VLenUTF8(),) else: - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}" + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}" raise ValueError(msg) return None @@ -4795,7 +4795,7 @@ def _parse_chunk_encoding_v2( elif isinstance(dtype, VariableLengthBytes): codec_name = "the numcodecs.VLenBytes codec" else: - codec_name = "an unknown object codec" + codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}" msg = ( f"Data type {dtype} requires {codec_name}, " "but no such codec was specified in the filters or compressor parameters for " diff --git a/tests/test_array.py b/tests/test_array.py index da9aa7aa30..285df21dd5 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -41,17 +41,22 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams from zarr.core.common import JSON, MemoryOrder, ZarrFormat -from 
zarr.core.dtype import parse_data_type -from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr -from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str -from zarr.core.dtype.npy.float import Float32, Float64 -from zarr.core.dtype.npy.int import Int16, UInt8 -from zarr.core.dtype.npy.string import VariableLengthUTF8 -from zarr.core.dtype.npy.structured import ( +from zarr.core.dtype import ( + DateTime64, + Float32, + Float64, + Int16, Structured, + TimeDelta64, + UInt8, + VariableLengthBytes, + VariableLengthUTF8, + ZDType, + parse_data_type, ) -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 -from zarr.core.dtype.wrapper import ZDType +from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr +from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str +from zarr.core.dtype.npy.string import UTF8Base from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv from zarr.core.metadata.v2 import ArrayV2Metadata @@ -1850,3 +1855,41 @@ def test_array_repr(store: Store) -> None: dtype = "uint8" arr = zarr.create_array(store, shape=shape, dtype=dtype) assert str(arr) == f"" + + +class UnknownObjectDtype(UTF8Base[np.dtypes.ObjectDType]): + object_codec_id = "unknown" # type: ignore[assignment] + + def to_native_dtype(self) -> np.dtypes.ObjectDType: + """ + Create a NumPy object dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.ObjectDType + The NumPy object dtype. 
+ """ + return np.dtype("o") + + +@pytest.mark.parametrize( + "dtype", [VariableLengthUTF8(), VariableLengthBytes(), UnknownObjectDtype()] +) +def test_chunk_encoding_no_object_codec_errors(dtype: ZDType[Any, Any]) -> None: + """ + Test that a ValueError is raised when checking the chunk encoding for a v2 array with a + data type that requires an object codec, but where no object codec is specified + """ + if isinstance(dtype, VariableLengthUTF8): + codec_name = "the numcodecs.VLenUTF8 codec" + elif isinstance(dtype, VariableLengthBytes): + codec_name = "the numcodecs.VLenBytes codec" + else: + codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}" + msg = ( + f"Data type {dtype} requires {codec_name}, " + "but no such codec was specified in the filters or compressor parameters for " + "this array. " + ) + with pytest.raises(ValueError, match=re.escape(msg)): + _parse_chunk_encoding_v2(filters=None, compressor=None, dtype=dtype) From 6bbd4dfc31201d509704c4f8cc0a2841c81dbd06 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 23:50:52 +0200 Subject: [PATCH 6/9] lint --- src/zarr/core/array.py | 8 ++++---- tests/test_array.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fc77567088..608843b861 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4790,10 +4790,10 @@ def _parse_chunk_encoding_v2( ) ) if object_codec_id is None: - if isinstance(dtype, VariableLengthUTF8): - codec_name = "the numcodecs.VLenUTF8 codec" - elif isinstance(dtype, VariableLengthBytes): - codec_name = "the numcodecs.VLenBytes codec" + if isinstance(dtype, VariableLengthUTF8): # type: ignore[unreachable] + codec_name = "the numcodecs.VLenUTF8 codec" # type: ignore[unreachable] + elif isinstance(dtype, VariableLengthBytes): # type: ignore[unreachable] + codec_name = "the numcodecs.VLenBytes codec" # type: ignore[unreachable] else: codec_name = f"an unknown object 
codec with id {dtype.object_codec_id!r}" msg = ( diff --git a/tests/test_array.py b/tests/test_array.py index 285df21dd5..1aca9ffb7a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1869,7 +1869,7 @@ def to_native_dtype(self) -> np.dtypes.ObjectDType: np.dtypes.ObjectDType The NumPy object dtype. """ - return np.dtype("o") + return np.dtype("o") # type: ignore[return-value] @pytest.mark.parametrize( @@ -1885,7 +1885,7 @@ def test_chunk_encoding_no_object_codec_errors(dtype: ZDType[Any, Any]) -> None: elif isinstance(dtype, VariableLengthBytes): codec_name = "the numcodecs.VLenBytes codec" else: - codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}" + codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}" # type: ignore[attr-defined] msg = ( f"Data type {dtype} requires {codec_name}, " "but no such codec was specified in the filters or compressor parameters for " From 0d7f83b129774cfd18460e0c8a056b6a25d01f18 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 11 Jul 2025 00:03:34 +0200 Subject: [PATCH 7/9] changelog --- changes/3228.removal.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 changes/3228.removal.rst diff --git a/changes/3228.removal.rst b/changes/3228.removal.rst new file mode 100644 index 0000000000..495d09f44a --- /dev/null +++ b/changes/3228.removal.rst @@ -0,0 +1,16 @@ +Removes default chunk encoding settings (filters, serializer, compressors) from the global +configuration object. + +This removal is justified on the basis that storing chunk encoding settings in the config required +a brittle, confusing, and inaccurate categorization of array data types, which was particularly +unsuitable after the recent addition of new data types that didn't fit naturally into the +pre-existing categories. 
+ +The default chunk encoding is the same (Zstandard compression, and the required object codecs for +variable length data types), but the chunk encoding is now generated by functions that cannot be +reconfigured at runtime. Users who relied on setting the default chunk encoding via the global configuration object should +instead specify the desired chunk encoding explicitly when creating an array. + +This change also adds an extra validation step to the creation of Zarr V2 arrays, which ensures that +arrays with a ``VariableLengthUTF8`` or ``VariableLengthBytes`` cannot be created without the +correct "object codec". \ No newline at end of file From 52c1be1628695f52209224d9dae4282e850131cb Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 11 Jul 2025 10:25:29 +0200 Subject: [PATCH 8/9] Update 3228.removal.rst --- changes/3228.removal.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/changes/3228.removal.rst b/changes/3228.removal.rst index 495d09f44a..0463897755 100644 --- a/changes/3228.removal.rst +++ b/changes/3228.removal.rst @@ -12,5 +12,5 @@ reconfigured at runtime. Users who relied on setting the default chunk encoding instead specify the desired chunk encoding explicitly when creating an array. This change also adds an extra validation step to the creation of Zarr V2 arrays, which ensures that -arrays with a ``VariableLengthUTF8`` or ``VariableLengthBytes`` cannot be created without the -correct "object codec". \ No newline at end of file +arrays with a ``VariableLengthUTF8`` or ``VariableLengthBytes`` data type cannot be created without the +correct "object codec". 
From d61637f69f51c61a4d1ab886031345c7dc95af46 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 11 Jul 2025 12:21:57 +0200 Subject: [PATCH 9/9] test coverage --- src/zarr/core/array.py | 4 ++-- tests/test_array.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 608843b861..a4f7fc086a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4699,7 +4699,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: elif dtype.object_codec_id == "vlen-utf8": serializer = VLenUTF8Codec() else: - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}" + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." raise ValueError(msg) return serializer @@ -4721,7 +4721,7 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | return (VLenUTF8(),) else: - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}" + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." 
raise ValueError(msg) return None diff --git a/tests/test_array.py b/tests/test_array.py index 1aca9ffb7a..4783bca05c 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -35,6 +35,8 @@ _parse_chunk_encoding_v3, chunks_initialized, create_array, + default_filters_v2, + default_serializer_v3, ) from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer @@ -1893,3 +1895,25 @@ def test_chunk_encoding_no_object_codec_errors(dtype: ZDType[Any, Any]) -> None: ) with pytest.raises(ValueError, match=re.escape(msg)): _parse_chunk_encoding_v2(filters=None, compressor=None, dtype=dtype) + + +def test_unknown_object_codec_default_serializer_v3() -> None: + """ + Test that we get a ValueError when trying to create the default serializer for a data type + that requires an unknown object codec + """ + dtype = UnknownObjectDtype() + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." + with pytest.raises(ValueError, match=re.escape(msg)): + default_serializer_v3(dtype) + + +def test_unknown_object_codec_default_filters_v2() -> None: + """ + Test that we get a ValueError when trying to create the default filters for a data type + that requires an unknown object codec + """ + dtype = UnknownObjectDtype() + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." + with pytest.raises(ValueError, match=re.escape(msg)): + default_filters_v2(dtype)