From abbf846cab90f418d0ed3c33f43560904a083aec Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 18 Feb 2025 15:18:37 +0100 Subject: [PATCH 1/6] adds extensions that zarr-python defines --- codecs/vlen-bytes/README.md | 40 +++++++++++++++++++++++++++++++++++ codecs/vlen-bytes/schema.json | 21 ++++++++++++++++++ codecs/vlen-utf8/README.md | 40 +++++++++++++++++++++++++++++++++++ codecs/vlen-utf8/schema.json | 21 ++++++++++++++++++ data-types/bytes/README.md | 28 ++++++++++++++++++++++++ data-types/bytes/schema.json | 21 ++++++++++++++++++ data-types/string/README.md | 28 ++++++++++++++++++++++++ data-types/string/schema.json | 21 ++++++++++++++++++ 8 files changed, 220 insertions(+) create mode 100644 codecs/vlen-bytes/README.md create mode 100644 codecs/vlen-bytes/schema.json create mode 100644 codecs/vlen-utf8/README.md create mode 100644 codecs/vlen-utf8/schema.json create mode 100644 data-types/bytes/README.md create mode 100644 data-types/bytes/schema.json create mode 100644 data-types/string/README.md create mode 100644 data-types/string/schema.json diff --git a/codecs/vlen-bytes/README.md b/codecs/vlen-bytes/README.md new file mode 100644 index 0000000..7108272 --- /dev/null +++ b/codecs/vlen-bytes/README.md @@ -0,0 +1,40 @@ +# Vlen-bytes codec + +Defines an `array -> bytes` codec that serializes variable-length byte string arrays. + +## Codec name + +The value of the `name` member in the codec object MUST be `vlen-bytes`. + +## Configuration parameters + +None. + +## Example + +For example, the array metadata below specifies that the array contains variable-length byte strings: + +```json +{ + "data_type": "bytes", + "codecs": [{ + "name": "vlen-bytes" + }], +} +``` + +## Format and algorithm + +This is a `array -> bytes` codec. + +This codec is only compatible with the [`"bytes"`](../../data-types/bytes/README.md) data type. + +See https://numcodecs.readthedocs.io/en/stable/other/vlen.html#vlenbytes for details about the encoding. 
+ +## Change log + +No changes yet. + +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) diff --git a/codecs/vlen-bytes/schema.json b/codecs/vlen-bytes/schema.json new file mode 100644 index 0000000..d7adf15 --- /dev/null +++ b/codecs/vlen-bytes/schema.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string", + "enum": ["vlen-bytes"] + }, + "configuration": { + "type": "object", + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + }, + { "type": "string", "enum": ["vlen-bytes"] } + ] +} diff --git a/codecs/vlen-utf8/README.md b/codecs/vlen-utf8/README.md new file mode 100644 index 0000000..ea6b59c --- /dev/null +++ b/codecs/vlen-utf8/README.md @@ -0,0 +1,40 @@ +# Vlen-utf8 codec + +Defines an `array -> bytes` codec that serializes variable-length UTF8 string arrays. + +## Codec name + +The value of the `name` member in the codec object MUST be `vlen-utf8`. + +## Configuration parameters + +None. + +## Example + +For example, the array metadata below specifies that the array contains variable-length UTF8 strings: + +```json +{ + "data_type": "string", + "codecs": [{ + "name": "vlen-utf8" + }], +} +``` + +## Format and algorithm + +This is a `array -> bytes` codec. + +This codec is only compatible with the [`"string"`](../../data-types/string/README.md) data type. + +See https://numcodecs.readthedocs.io/en/stable/other/vlen.html#vlenutf8 for details about the encoding. + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) diff --git a/codecs/vlen-utf8/schema.json b/codecs/vlen-utf8/schema.json new file mode 100644 index 0000000..97cec90 --- /dev/null +++ b/codecs/vlen-utf8/schema.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string", + "enum": ["vlen-utf8"] + }, + "configuration": { + "type": "object", + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + }, + { "type": "string", "enum": ["vlen-utf8"] } + ] +} diff --git a/data-types/bytes/README.md b/data-types/bytes/README.md new file mode 100644 index 0000000..13482ba --- /dev/null +++ b/data-types/bytes/README.md @@ -0,0 +1,28 @@ +# Bytes data type + +Defines a data type for variable-length byte strings. + +## Example + +For example, the array metadata below specifies that the array contains variable-length byte strings: + +```json +{ + "data_type": "bytes", + "codecs": [{ + "name": "vlen-bytes" + }], +} +``` + +## Notes + +Currently, this data type is only compatible with the [`"vlen-bytes"`](../../codecs/vlen-bytes/README.md) codec. + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) diff --git a/data-types/bytes/schema.json b/data-types/bytes/schema.json new file mode 100644 index 0000000..6b1547d --- /dev/null +++ b/data-types/bytes/schema.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string", + "enum": ["bytes"] + }, + "configuration": { + "type": "object", + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + }, + { "type": "string", "enum": ["bytes"] } + ] +} diff --git a/data-types/string/README.md b/data-types/string/README.md new file mode 100644 index 0000000..812f502 --- /dev/null +++ b/data-types/string/README.md @@ -0,0 +1,28 @@ +# String data type + +Defines a data type for variable-length UTF8 strings. + +## Example + +For example, the array metadata below specifies that the array contains variable-length byte strings: + +```json +{ + "data_type": "string", + "codecs": [{ + "name": "vlen-utf8" + }], +} +``` + +## Notes + +Currently, this data type is only compatible with the [`"vlen-utf8"`](../../codecs/vlen-utf8/README.md) codec. + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) diff --git a/data-types/string/schema.json b/data-types/string/schema.json new file mode 100644 index 0000000..48e18cf --- /dev/null +++ b/data-types/string/schema.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string", + "enum": ["string"] + }, + "configuration": { + "type": "object", + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false + }, + { "type": "string", "enum": ["string"] } + ] +} From 8e19ccde8e07603b501fb14fa3d2e4e8fa531717 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 24 Feb 2025 16:23:10 +0100 Subject: [PATCH 2/6] fill_values --- data-types/bytes/README.md | 5 +++++ data-types/string/README.md | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/data-types/bytes/README.md b/data-types/bytes/README.md index 13482ba..469fc93 100644 --- a/data-types/bytes/README.md +++ b/data-types/bytes/README.md @@ -2,6 +2,10 @@ Defines a data type for variable-length byte strings. +## Permitted fill values + +The value of the `fill_value` metadata key must be an array of byte values. + ## Example For example, the array metadata below specifies that the array contains variable-length byte strings: @@ -9,6 +13,7 @@ For example, the array metadata below specifies that the array contains variable ```json { "data_type": "bytes", + "fill_value": [1, 2, 3], "codecs": [{ "name": "vlen-bytes" }], diff --git a/data-types/string/README.md b/data-types/string/README.md index 812f502..3ee034f 100644 --- a/data-types/string/README.md +++ b/data-types/string/README.md @@ -2,6 +2,10 @@ Defines a data type for variable-length UTF8 strings. +## Permitted fill values + +The value of the `fill_value` metadata key must be a Unicode string. 
+ ## Example For example, the array metadata below specifies that the array contains variable-length byte strings: @@ -9,6 +13,7 @@ For example, the array metadata below specifies that the array contains variable ```json { "data_type": "string", + "fill_value": "foo", "codecs": [{ "name": "vlen-utf8" }], From 9dcdb66497dcfc5f4e34479a22ce9feb64a2f04e Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 10 Apr 2025 15:57:48 +0200 Subject: [PATCH 3/6] update schema --- codecs/vlen-bytes/schema.json | 7 +++---- codecs/vlen-utf8/schema.json | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/codecs/vlen-bytes/schema.json b/codecs/vlen-bytes/schema.json index d7adf15..021eecb 100644 --- a/codecs/vlen-bytes/schema.json +++ b/codecs/vlen-bytes/schema.json @@ -1,12 +1,11 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "anyOf": [ + "oneOf": [ { "type": "object", "properties": { "name": { - "type": "string", - "enum": ["vlen-bytes"] + "const": "vlen-bytes" }, "configuration": { "type": "object", @@ -16,6 +15,6 @@ "required": ["name"], "additionalProperties": false }, - { "type": "string", "enum": ["vlen-bytes"] } + { "const": "vlen-bytes" } ] } diff --git a/codecs/vlen-utf8/schema.json b/codecs/vlen-utf8/schema.json index 97cec90..542126b 100644 --- a/codecs/vlen-utf8/schema.json +++ b/codecs/vlen-utf8/schema.json @@ -1,12 +1,11 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "anyOf": [ + "oneOf": [ { "type": "object", "properties": { "name": { - "type": "string", - "enum": ["vlen-utf8"] + "const": "vlen-utf8" }, "configuration": { "type": "object", @@ -16,6 +15,6 @@ "required": ["name"], "additionalProperties": false }, - { "type": "string", "enum": ["vlen-utf8"] } + { "const": "vlen-utf8" } ] } From 02b24bb1077ccefd3cc864b0611dac1c0a8cc6cb Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 10 Apr 2025 15:58:56 +0200 Subject: [PATCH 4/6] update schema --- data-types/bytes/schema.json | 7 +++---- 
data-types/string/schema.json | 5 ++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/data-types/bytes/schema.json b/data-types/bytes/schema.json index 6b1547d..d9deb92 100644 --- a/data-types/bytes/schema.json +++ b/data-types/bytes/schema.json @@ -1,12 +1,11 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "anyOf": [ + "oneOf": [ { "type": "object", "properties": { "name": { - "type": "string", - "enum": ["bytes"] + "const": "bytes" }, "configuration": { "type": "object", @@ -16,6 +15,6 @@ "required": ["name"], "additionalProperties": false }, - { "type": "string", "enum": ["bytes"] } + { "const": "bytes" } ] } diff --git a/data-types/string/schema.json b/data-types/string/schema.json index 48e18cf..6e42719 100644 --- a/data-types/string/schema.json +++ b/data-types/string/schema.json @@ -5,8 +5,7 @@ "type": "object", "properties": { "name": { - "type": "string", - "enum": ["string"] + "const": "string" }, "configuration": { "type": "object", @@ -16,6 +15,6 @@ "required": ["name"], "additionalProperties": false }, - { "type": "string", "enum": ["string"] } + { "const": "string" } ] } From 134c59bc8f9a959cc11a0ca1020296cd3cbc44dd Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 17 Apr 2025 15:44:02 +0200 Subject: [PATCH 5/6] oneOf --- data-types/string/schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-types/string/schema.json b/data-types/string/schema.json index 6e42719..58366e7 100644 --- a/data-types/string/schema.json +++ b/data-types/string/schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "anyOf": [ + "oneOf": [ { "type": "object", "properties": { From 4f3528342734a2dd33ec4e4cbcdb61375a7d08c5 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 18 Apr 2025 13:19:36 +0200 Subject: [PATCH 6/6] better spec --- codecs/vlen-bytes/README.md | 4 ++++ codecs/vlen-utf8/README.md | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git 
a/codecs/vlen-bytes/README.md b/codecs/vlen-bytes/README.md index 7108272..d2d6b22 100644 --- a/codecs/vlen-bytes/README.md +++ b/codecs/vlen-bytes/README.md @@ -29,6 +29,10 @@ This is a `array -> bytes` codec. This codec is only compatible with the [`"bytes"`](../../data-types/bytes/README.md) data type. +In the encoded format, each chunk is prefixed with a 32-bit little-endian unsigned integer (u32le) that specifies the number of elements in the chunk. +This prefix is followed by a sequence of encoded elements in lexicographical order. +Each element in the sequence is encoded by a u32le representing the number of bytes followed by the bytes themselves. + See https://numcodecs.readthedocs.io/en/stable/other/vlen.html#vlenbytes for details about the encoding. ## Change log diff --git a/codecs/vlen-utf8/README.md b/codecs/vlen-utf8/README.md index ea6b59c..2b6e830 100644 --- a/codecs/vlen-utf8/README.md +++ b/codecs/vlen-utf8/README.md @@ -1,6 +1,6 @@ # Vlen-utf8 codec -Defines an `array -> bytes` codec that serializes variable-length UTF8 string arrays. +Defines an `array -> bytes` codec that serializes variable-length UTF-8 string arrays. ## Codec name @@ -12,7 +12,7 @@ None. ## Example -For example, the array metadata below specifies that the array contains variable-length UTF8 strings: +For example, the array metadata below specifies that the array contains variable-length UTF-8 strings: ```json { @@ -29,6 +29,11 @@ This is a `array -> bytes` codec. This codec is only compatible with the [`"string"`](../../data-types/string/README.md) data type. +In the encoded format, each chunk is prefixed with a 32-bit little-endian unsigned integer (u32le) that specifies the number of elements in the chunk. +This prefix is followed by a sequence of encoded elements in lexicographical order. +Each element in the sequence is encoded by a u32le representing the number of bytes followed by the bytes themselves. 
+The bytes for each element are obtained by encoding the element as UTF-8 bytes. + See https://numcodecs.readthedocs.io/en/stable/other/vlen.html#vlenutf8 for details about the encoding. ## Change log