From 5e8eeaa19cdd3c73992a6ee328a75cad48bb733a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 8 Apr 2026 10:30:39 +0200 Subject: [PATCH 01/10] wip --- src/zarr/core/array_spec.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 2b5eb0191c..e3b6bfd023 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass, fields from typing import TYPE_CHECKING, Any, Literal, Self, TypedDict, cast @@ -38,6 +39,23 @@ class ArrayConfig: Parameters ---------- + order : MemoryOrder + The memory layout of the arrays returned when reading data from the store. + write_empty_chunks : bool + If True, empty chunks will be written to the store. + read_missing_chunks : bool, default is True + If True, missing chunks will be filled with the array's fill value on read. + If False, reading missing chunks will raise a ``ChunkNotFoundError``. + codec_classes : Mapping[str, object] | None, default is None + A codec name : codec class mapping that defines the codec classes available + for this array. Defaults to `None`, in which case a default collection of codecs + is retrieved from the global config object. + data_type_classes : set[ZDType] | None, default is None. + A set of data type classes to use + A data type identi + + Attributes + ---------- order : MemoryOrder The memory layout of the arrays returned when reading data from the store. write_empty_chunks : bool @@ -45,14 +63,27 @@ class ArrayConfig: read_missing_chunks : bool If True, missing chunks will be filled with the array's fill value on read. If False, reading missing chunks will raise a ``ChunkNotFoundError``. + codec_classes : Mapping[str, object] + A codec name : codec class mapping that defines the codec classes available + for this array. 
+ data_type_clas """ order: MemoryOrder write_empty_chunks: bool read_missing_chunks: bool + codec_classes: Mapping[str, object] + data_type_classes: set[ZDType[Any, Any]] + codec_pipeline_class: object def __init__( - self, order: MemoryOrder, write_empty_chunks: bool, *, read_missing_chunks: bool = True + self, + order: MemoryOrder, + write_empty_chunks: bool, + *, + read_missing_chunks: bool = True, + codec_class_map: Mapping[str, object] | None = None, + codec_pipeline_class: object | None = None, ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) From 92691203193794095784bd0596f8bf8d99e5d408 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 8 Apr 2026 21:21:02 +0200 Subject: [PATCH 02/10] feat: define the codec name: codec class mapping as part of array runtime configuration --- src/zarr/codecs/sharding.py | 11 +- src/zarr/core/array.py | 25 ++-- src/zarr/core/array_spec.py | 179 ++++++++++++++++++++++++---- src/zarr/core/metadata/v2.py | 9 +- src/zarr/core/metadata/v3.py | 28 +++-- tests/test_array.py | 21 +++- tests/test_codecs/test_blosc.py | 4 +- tests/test_codecs/test_crc32c.py | 4 +- tests/test_codecs/test_endian.py | 4 +- tests/test_codecs/test_gzip.py | 4 +- tests/test_codecs/test_transpose.py | 4 +- tests/test_codecs/test_zstd.py | 4 +- tests/test_sync_codec_pipeline.py | 4 +- 13 files changed, 239 insertions(+), 62 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 609e32f87d..dee7af7583 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -26,7 +26,7 @@ ) from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig, parse_codec_class_map from zarr.core.buffer import ( Buffer, BufferPrototype, @@ -319,10 +319,13 @@ def __init__( codecs: Iterable[Codec | dict[str, JSON]] = 
(BytesCodec(),), index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()), index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end, + codec_class_map: Mapping[str, type[Codec]] | None = None, ) -> None: + if codec_class_map is None: + codec_class_map = parse_codec_class_map(None) chunk_shape_parsed = parse_shapelike(chunk_shape) - codecs_parsed = parse_codecs(codecs) - index_codecs_parsed = parse_codecs(index_codecs) + codecs_parsed = parse_codecs(codecs, codec_class_map=codec_class_map) + index_codecs_parsed = parse_codecs(index_codecs, codec_class_map=codec_class_map) index_location_parsed = parse_index_location(index_location) object.__setattr__(self, "chunk_shape", chunk_shape_parsed) @@ -737,7 +740,7 @@ def _get_index_chunk_spec(self, chunks_per_shard: tuple[int, ...]) -> ArraySpec: shape=chunks_per_shard + (2,), dtype=UInt64(endianness="little"), fill_value=MAX_UINT_64, - config=ArrayConfig( + config=ArraySpecConfig( order="C", write_empty_chunks=False ), # Note: this is hard-coded for simplicity -- it is not surfaced into user code, prototype=default_buffer_prototype(), diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4736805b9d..08ee52a068 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -28,7 +28,13 @@ from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config +from zarr.core.array_spec import ( + ArrayConfig, + ArrayConfigLike, + ArraySpec, + ArraySpecConfig, + parse_array_config, +) from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -197,13 +203,13 @@ def _chunk_sizes_from_shape( return tuple(result) -def parse_array_metadata(data: Any) -> ArrayMetadata: +def parse_array_metadata(data: object, config: ArrayConfig) -> ArrayMetadata: if isinstance(data, 
ArrayMetadata): - return data + return data.with_config(config) elif isinstance(data, dict): zarr_format = data.get("zarr_format") if zarr_format == 3: - meta_out = ArrayV3Metadata.from_dict(data) + meta_out = ArrayV3Metadata.from_dict(data, config=config) if len(meta_out.storage_transformers) > 0: msg = ( f"Array metadata contains storage transformers: {meta_out.storage_transformers}." @@ -212,7 +218,7 @@ def parse_array_metadata(data: Any) -> ArrayMetadata: raise ValueError(msg) return meta_out elif zarr_format == 2: - return ArrayV2Metadata.from_dict(data) + return ArrayV2Metadata.from_dict(data, config=config) else: raise ValueError(f"Invalid zarr_format: {zarr_format}. Expected 2 or 3") raise TypeError # pragma: no cover @@ -353,8 +359,8 @@ def __init__( store_path: StorePath, config: ArrayConfigLike | None = None, ) -> None: - metadata_parsed = parse_array_metadata(metadata) config_parsed = parse_array_config(config) + metadata_parsed = parse_array_metadata(metadata, config=config_parsed) object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) @@ -5769,11 +5775,16 @@ def _get_chunk_spec( spec = chunk_grid[chunk_coords] if spec is None: raise IndexError(f"Chunk coordinates {chunk_coords} are out of bounds.") + spec_config = ArraySpecConfig( + order=array_config.order, + read_missing_chunks=array_config.read_missing_chunks, + write_empty_chunks=array_config.write_empty_chunks, + ) return ArraySpec( shape=spec.codec_shape, dtype=metadata.dtype, fill_value=metadata.fill_value, - config=array_config, + config=spec_config, prototype=prototype, ) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e3b6bfd023..5617a24d01 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -1,9 +1,9 @@ from __future__ import annotations -from collections.abc import Mapping from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Any, Literal, Self, TypedDict, cast 
+from typing import TYPE_CHECKING, Any, Final, Literal, Self, TypedDict, cast +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.common import ( MemoryOrder, parse_bool, @@ -14,13 +14,35 @@ from zarr.core.config import config as zarr_config if TYPE_CHECKING: + from collections.abc import Mapping from typing import NotRequired from zarr.core.buffer import BufferPrototype from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +class CodecPipelineRequest(TypedDict): + """ + A dictionary model of a request for a codec pipeline. + """ + + class_path: str + options: NotRequired[dict[str, object]] + + class ArrayConfigParams(TypedDict): + """ + A TypedDict model of the attributes of an ArrayConfig class. + """ + + order: MemoryOrder + write_empty_chunks: bool + read_missing_chunks: bool + codec_class_map: Mapping[str, object] + codec_pipeline_class: CodecPipelineRequest + + +class ArrayConfigRequest(TypedDict): """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. 
This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -30,6 +52,29 @@ class ArrayConfigParams(TypedDict): order: NotRequired[MemoryOrder] write_empty_chunks: NotRequired[bool] read_missing_chunks: NotRequired[bool] + codec_class_map: NotRequired[ + Mapping[str, type[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec]] + ] + codec_pipeline_class: NotRequired[CodecPipelineRequest] + + +ArrayConfigKeys = Literal[ + "order", "write_empty_chunks", "read_missing_chunks", "codec_class_map", "codec_pipeline_class" +] + +ARRAY_CONFIG_PARAMS_KEYS: Final[set[str]] = { + "order", + "write_empty_chunks", + "read_missing_chunks", + "codec_class_map", + "codec_pipeline_class", +} +ARRAY_CONFIG_PARAMS_KEYS_STATIC: Final[set[str]] = { + "order", + "write_empty_chunks", + "read_missing_chunks", +} +"""The keys of the ArrayConfigParams object that are static and retrievable from the config""" @dataclass(frozen=True) @@ -46,13 +91,14 @@ class ArrayConfig: read_missing_chunks : bool, default is True If True, missing chunks will be filled with the array's fill value on read. If False, reading missing chunks will raise a ``ChunkNotFoundError``. - codec_classes : Mapping[str, object] | None, default is None - A codec name : codec class mapping that defines the codec classes available - for this array. Defaults to `None`, in which case a default collection of codecs + codec_class_map : Mapping[str, object] | None, default is None + A request for a codec name : codec class mapping that defines the codec classes available + for array creation. Defaults to `None`, in which case a default collection of codecs is retrieved from the global config object. - data_type_classes : set[ZDType] | None, default is None. - A set of data type classes to use - A data type identi + codec_pipeline_class : CodecPipelineRequest | None, default = None + A request for a codec pipeline class to be used for orchestrating chunk encoding and + decoding. 
Defaults to `None`, in which case the default codec pipeline request + is retrieved from information in the global config object. Attributes ---------- @@ -63,18 +109,19 @@ class ArrayConfig: read_missing_chunks : bool If True, missing chunks will be filled with the array's fill value on read. If False, reading missing chunks will raise a ``ChunkNotFoundError``. - codec_classes : Mapping[str, object] + codec_class_map : Mapping[str, object] A codec name : codec class mapping that defines the codec classes available - for this array. - data_type_clas + for array creation. + codec_pipeline_class : CodecPipelineRequest + A request for a pipeline class that will be used for orchestrating chunk encoding and + decoding. """ order: MemoryOrder write_empty_chunks: bool read_missing_chunks: bool - codec_classes: Mapping[str, object] - data_type_classes: set[ZDType[Any, Any]] - codec_pipeline_class: object + codec_class_map: Mapping[str, type[Codec]] + codec_pipeline_class: CodecPipelineRequest def __init__( self, @@ -82,31 +129,42 @@ def __init__( write_empty_chunks: bool, *, read_missing_chunks: bool = True, - codec_class_map: Mapping[str, object] | None = None, - codec_pipeline_class: object | None = None, + codec_class_map: Mapping[str, type[ArrayBytesCodec | ArrayArrayCodec | BytesBytesCodec]] + | None = None, + codec_pipeline_class: CodecPipelineRequest | None = None, ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) read_missing_chunks_parsed = parse_bool(read_missing_chunks) + codec_class_map_parsed = parse_codec_class_map(codec_class_map) + codec_pipeline_class_parsed = parse_codec_pipeline_class(codec_pipeline_class) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) object.__setattr__(self, "read_missing_chunks", read_missing_chunks_parsed) + object.__setattr__(self, "codec_class_map", codec_class_map_parsed) + object.__setattr__(self, 
"codec_pipeline_class", codec_pipeline_class_parsed) @classmethod - def from_dict(cls, data: ArrayConfigParams) -> Self: + def from_dict(cls, data: ArrayConfigRequest) -> Self: """ Create an ArrayConfig from a dict. The keys of that dict are a subset of the attributes of the ArrayConfig class. Any keys missing from that dict will be set to the the values in the ``array`` namespace of ``zarr.config``. """ - kwargs_out: ArrayConfigParams = {} + kwargs_out: ArrayConfigRequest = {} for f in fields(ArrayConfig): field_name = cast( - "Literal['order', 'write_empty_chunks', 'read_missing_chunks']", f.name + "Literal['order', 'write_empty_chunks', 'read_missing_chunks', 'codec_class_map', 'codec_pipeline_class']", + f.name, ) if field_name not in data: - kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") + if field_name in ARRAY_CONFIG_PARAMS_KEYS_STATIC: + kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") + elif field_name == "codec_class_map": + kwargs_out["codec_class_map"] = parse_codec_class_map(None) + elif field_name == "codec_pipeline_class": + kwargs_out["codec_pipeline_class"] = parse_codec_pipeline_class(None) else: kwargs_out[field_name] = data[field_name] return cls(**kwargs_out) @@ -119,10 +177,76 @@ def to_dict(self) -> ArrayConfigParams: "order": self.order, "write_empty_chunks": self.write_empty_chunks, "read_missing_chunks": self.read_missing_chunks, + "codec_class_map": self.codec_class_map, + "codec_pipeline_class": self.codec_pipeline_class, } -ArrayConfigLike = ArrayConfig | ArrayConfigParams +ArrayConfigLike = ArrayConfig | ArrayConfigRequest + + +def _import_by_name(path: str) -> object | type: + """ + Import an object by its fully qualified name. 
+ """ + import importlib + + parts = path.split(".") + + # Try progressively shorter module paths + for i in range(len(parts), 0, -1): + module_path = ".".join(parts[:i]) + try: + module = importlib.import_module(module_path) + break + except ModuleNotFoundError: + continue + else: + raise ImportError(f"Could not import any module from '{path}'") + + obj = module + for attr in parts[i:]: + try: + obj = getattr(obj, attr) + except AttributeError as e: + raise ImportError(f"Attribute '{attr}' not found in '{obj}'") from e + return obj + + +def parse_codec_pipeline_class(obj: CodecPipelineRequest | None) -> CodecPipelineRequest: + if obj is None: + config_entry: dict[str, str | int] = zarr_config.get("codec_pipeline") + if "path" not in config_entry: + msg = ( + "The codec_pipeline field in the global config is malformed. " + "Expected 'path' key was not found." + ) + raise KeyError(msg) + else: + path = config_entry["path"] + options = {"batch_size": config_entry.get("batch_size", 1)} + return {"class_path": path, "options": options} + return obj + + +def parse_codec_class_map(obj: Mapping[str, type[Codec]] | None) -> Mapping[str, type[Codec]]: + """ + Convert a request for a codec class map into an actual Mapping[str, type[Codec]]. + If the input is `None`, then we look up the list of codecs from the registry, where they + are stored as fully qualified class names. We must resolve these names to concrete classes + before inserting them into the returned mapping. 
+ """ + if obj is None: + name_map: dict[str, str] = zarr_config.get("codecs", {}) + out: dict[str, type[Codec]] = {} + for key, value in name_map.items(): + maybe_cls = _import_by_name(value) + if not issubclass(maybe_cls, Codec): + msg = f"Expected a subclass of `Codec`, got {maybe_cls}" + raise TypeError(msg) + out[key] = maybe_cls + return out + return obj def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @@ -137,12 +261,19 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: return ArrayConfig.from_dict(data) +@dataclass(frozen=True) +class ArraySpecConfig: + order: MemoryOrder + write_empty_chunks: bool + read_missing_chunks: bool = False + + @dataclass(frozen=True) class ArraySpec: shape: tuple[int, ...] dtype: ZDType[TBaseDType, TBaseScalar] fill_value: Any - config: ArrayConfig + config: ArraySpecConfig prototype: BufferPrototype def __init__( @@ -150,12 +281,12 @@ def __init__( shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], fill_value: Any, - config: ArrayConfig, + config: ArraySpecConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) fill_value_parsed = parse_fill_value(fill_value) - + assert isinstance(config, ArraySpecConfig) object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "dtype", dtype) object.__setattr__(self, "fill_value", fill_value_parsed) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 8626d480a7..ce66e2368b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -30,7 +30,7 @@ import numpy as np -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec, ArraySpecConfig from zarr.core.chunk_key_encodings import parse_separator from zarr.core.common import ( JSON, @@ -242,11 +242,16 @@ def to_dict(self) -> dict[str, JSON]: def get_chunk_spec( self, _chunk_coords: tuple[int, ...], array_config: ArrayConfig, prototype: 
BufferPrototype ) -> ArraySpec: + spec_config = ArraySpecConfig( + order=array_config.order, + read_missing_chunks=array_config.read_missing_chunks, + write_empty_chunks=array_config.write_empty_chunks, + ) return ArraySpec( shape=self.chunks, dtype=self.dtype, fill_value=self.fill_value, - config=array_config, + config=spec_config, prototype=prototype, ) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 7773e2489d..aea04359b7 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -7,7 +7,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.metadata import Metadata -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec, ArraySpecConfig from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, @@ -33,7 +33,6 @@ from zarr.core.dtype.common import check_dtype_spec_v3 from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError, UnknownCodecError -from zarr.registry import get_codec_class if TYPE_CHECKING: from typing import Self @@ -56,7 +55,7 @@ def parse_node_type_array(data: object) -> Literal["array"]: raise NodeTypeValidationError(msg) -def parse_codecs(data: object) -> tuple[Codec, ...]: +def parse_codecs(data: object, codec_class_map: Mapping[str, type[Codec]]) -> tuple[Codec, ...]: out: tuple[Codec, ...] 
= () if not isinstance(data, Iterable): @@ -71,7 +70,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: name_parsed, _ = parse_named_configuration(c, require_configuration=False) try: - out += (get_codec_class(name_parsed).from_dict(c),) + out += (codec_class_map[name_parsed].from_dict(c),) except KeyError as e: raise UnknownCodecError(f"Unknown codec: {e.args[0]!r}") from e @@ -460,11 +459,14 @@ def __init__( dimension_names: DimensionNamesLike, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, + _config: ArrayConfig | None = None, ) -> None: """ Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ - + # generate a new default config if _config is `None` + if _config is None: + _config = ArrayConfig.from_dict({}) shape_parsed = parse_shapelike(shape) chunk_grid_parsed = parse_chunk_grid(chunk_grid) chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) @@ -472,14 +474,16 @@ def __init__( # Note: relying on a type method is numpy-specific fill_value_parsed = data_type.cast_scalar(fill_value) attributes_parsed = parse_attributes(attributes) - codecs_parsed_partial = parse_codecs(codecs) + codecs_parsed_partial = parse_codecs(codecs, _config.codec_class_map) storage_transformers_parsed = parse_storage_transformers(storage_transformers) extra_fields_parsed = parse_extra_fields(extra_fields) array_spec = ArraySpec( shape=shape_parsed, dtype=data_type, fill_value=fill_value_parsed, - config=ArrayConfig.from_dict({}), # TODO: config is not needed here. + config=ArraySpecConfig( + write_empty_chunks=_config.write_empty_chunks, order=_config.order + ), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
) codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial) @@ -495,6 +499,7 @@ def __init__( object.__setattr__(self, "attributes", attributes_parsed) object.__setattr__(self, "storage_transformers", storage_transformers_parsed) object.__setattr__(self, "extra_fields", extra_fields_parsed) + object.__setattr__(self, "_config", _config) self._validate_metadata() @@ -573,7 +578,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: } @classmethod - def from_dict(cls, data: dict[str, JSON]) -> Self: + def from_dict(cls, data: dict[str, JSON], *, config: ArrayConfig | None = None) -> Self: # make a copy because we are modifying the dict _data = data.copy() @@ -626,6 +631,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data_type=data_type, extra_fields=allowed_extra_fields, storage_transformers=_data_typed.get("storage_transformers", ()), # type: ignore[arg-type] + _config=config, ) def to_dict(self) -> dict[str, JSON]: @@ -663,3 +669,9 @@ def update_shape(self, shape: tuple[int, ...]) -> Self: def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) + + def with_config(self, config: ArrayConfig | None) -> Self: + """ + Return a copy of this metadata with a new configuration object. 
+ """ + return type(self).from_dict(self.to_dict(), config=config) diff --git a/tests/test_array.py b/tests/test_array.py index f7f564f30e..7bbb9370fe 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -44,7 +44,7 @@ default_filters_v2, default_serializer_v3, ) -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams +from zarr.core.array_spec import ArrayConfig, ArrayConfigRequest from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams @@ -2287,13 +2287,13 @@ def test_shard_write_num_gets(selection: slice, expected_gets: int) -> None: @pytest.mark.parametrize("config", [{}, {"write_empty_chunks": True}, {"order": "C"}]) -def test_with_config(config: ArrayConfigParams) -> None: +def test_with_config(config: ArrayConfigRequest) -> None: """ Test that `AsyncArray.with_config` and `Array.with_config` create a copy of the source array with a new runtime configuration. """ # the config we start with - source_config: ArrayConfigParams = {"write_empty_chunks": False, "order": "F"} + source_config: ArrayConfigRequest = {"write_empty_chunks": False, "order": "F"} source_array = zarr.create_array({}, shape=(1,), dtype="uint8", config=source_config) new_async_array_config_dict = source_array._async_array.with_config(config).config.to_dict() @@ -2321,3 +2321,18 @@ def test_with_config_polymorphism() -> None: arr_source_config_dict = arr.with_config(source_config_dict) assert arr_source_config.config == arr_source_config_dict.config + + +def test_array_config_specify_codecs() -> None: + """ + Test that we can use the array config to define the codec classes available to the array + """ + + class FakeGzipCodec(GzipCodec): ... 
+ + store = {} + arr = zarr.create_array(store, shape=(1,), dtype="uint8", compressors=GzipCodec()) + arr_2 = arr.with_config( + {"codec_class_map": {**arr.config.codec_class_map, "gzip": FakeGzipCodec}} + ) + assert isinstance(arr_2.compressors[0], FakeGzipCodec) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 0201beb8de..717ac4574a 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -9,7 +9,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.codecs import BloscCodec from zarr.codecs.blosc import BloscShuffle, Shuffle -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import UInt16, get_data_type_from_native_dtype from zarr.storage import MemoryStore, StorePath @@ -125,7 +125,7 @@ def test_blosc_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_codecs/test_crc32c.py b/tests/test_codecs/test_crc32c.py index 3ab1070f60..941531100e 100644 --- a/tests/test_codecs/test_crc32c.py +++ b/tests/test_codecs/test_crc32c.py @@ -4,7 +4,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype @@ -21,7 +21,7 @@ def test_crc32c_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + 
config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index c505cee828..9d043c6790 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -7,7 +7,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import BytesCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import NDBuffer, default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -49,7 +49,7 @@ def test_bytes_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 8932ba5e59..feeb9f9949 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -5,7 +5,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import GzipCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -40,7 +40,7 @@ def test_gzip_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), 
prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 949bb72a62..16fe2e6bb5 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -6,7 +6,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import TransposeCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import NDBuffer, default_buffer_prototype from zarr.core.common import MemoryOrder from zarr.core.dtype import get_data_type_from_native_dtype @@ -111,7 +111,7 @@ def test_transpose_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 3f3f15a41a..199b77a941 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -5,7 +5,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -41,7 +41,7 @@ def test_zstd_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) buf = 
default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index 1bfde7c837..4e0d9f4484 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -11,7 +11,7 @@ from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import Buffer, NDBuffer, default_buffer_prototype from zarr.core.codec_pipeline import ChunkTransform from zarr.core.dtype import get_data_type_from_native_dtype @@ -38,7 +38,7 @@ def _make_array_spec(shape: tuple[int, ...], dtype: np.dtype[np.generic]) -> Arr shape=shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) From 94c9a8556bcd9ac01a74a52ac3c2c7912d74917a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 11:07:13 +0200 Subject: [PATCH 03/10] fix: can now declare config in open_array --- src/zarr/api/asynchronous.py | 5 +++- src/zarr/api/synchronous.py | 4 +++ src/zarr/codecs/sharding.py | 11 ++++++-- src/zarr/core/array.py | 55 ++++++++++++++++++++++++++---------- src/zarr/core/array_spec.py | 32 ++++++++++----------- src/zarr/core/metadata/v3.py | 24 +++++++++------- tests/test_array.py | 35 ++++++++++++++++++----- 7 files changed, 115 insertions(+), 51 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c776176665..2664cf9dea 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1242,6 +1242,7 @@ async def open_array( zarr_format: ZarrFormat | None = None, path: PathLike = "", storage_options: dict[str, Any] | None = None, + config: ArrayConfigLike | None = None, 
**kwargs: Any, # TODO: type kwargs as valid args to save ) -> AnyAsyncArray: """Open an array using file-mode-like semantics. @@ -1261,6 +1262,8 @@ async def open_array( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfigLike + Declaration of the runtime configuration for the array. **kwargs Any keyword arguments to pass to [`create`][zarr.api.asynchronous.create]. @@ -1279,7 +1282,7 @@ async def open_array( _warn_write_empty_chunks_kwarg() try: - return await AsyncArray.open(store_path, zarr_format=zarr_format) + return await AsyncArray.open(store_path, zarr_format=zarr_format, config=config) except FileNotFoundError as err: if not store_path.read_only and mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index a865f97646..8f404d5eb6 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -1369,6 +1369,7 @@ def open_array( zarr_format: ZarrFormat | None = None, path: PathLike = "", storage_options: dict[str, Any] | None = None, + config: ArrayConfigLike | None = None, **kwargs: Any, ) -> AnyArray: """Open an array using file-mode-like semantics. @@ -1388,6 +1389,8 @@ def open_array( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfigLike + Declaration of the runtime configuration for the array. **kwargs Any keyword arguments to pass to [`create`][zarr.api.asynchronous.create]. 
@@ -1405,6 +1408,7 @@ def open_array( zarr_format=zarr_format, path=path, storage_options=storage_options, + config=config, **kwargs, ) ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index dee7af7583..03bbd0b89e 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -348,9 +348,16 @@ def __getstate__(self) -> dict[str, Any]: def __setstate__(self, state: dict[str, Any]) -> None: config = state["configuration"] + codec_class_map = parse_codec_class_map(None) object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"])) - object.__setattr__(self, "codecs", parse_codecs(config["codecs"])) - object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"])) + object.__setattr__( + self, "codecs", parse_codecs(config["codecs"], codec_class_map=codec_class_map) + ) + object.__setattr__( + self, + "index_codecs", + parse_codecs(config["index_codecs"], codec_class_map=codec_class_map), + ) object.__setattr__(self, "index_location", parse_index_location(config["index_location"])) # Use instance-local lru_cache to avoid memory leaks diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 08ee52a068..dec4c23941 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -203,13 +203,13 @@ def _chunk_sizes_from_shape( return tuple(result) -def parse_array_metadata(data: object, config: ArrayConfig) -> ArrayMetadata: - if isinstance(data, ArrayMetadata): - return data.with_config(config) +def parse_array_metadata(data: object, codec_class_map: Mapping[str, type[Codec]]) -> ArrayMetadata: + if isinstance(data, ArrayV3Metadata): + return type(data).from_dict(data.to_dict(), codec_class_map=codec_class_map) elif isinstance(data, dict): zarr_format = data.get("zarr_format") if zarr_format == 3: - meta_out = ArrayV3Metadata.from_dict(data, config=config) + meta_out = ArrayV3Metadata.from_dict(data, codec_class_map=codec_class_map) if len(meta_out.storage_transformers) > 0: msg = ( 
f"Array metadata contains storage transformers: {meta_out.storage_transformers}." @@ -218,26 +218,37 @@ def parse_array_metadata(data: object, config: ArrayConfig) -> ArrayMetadata: raise ValueError(msg) return meta_out elif zarr_format == 2: - return ArrayV2Metadata.from_dict(data, config=config) + return ArrayV2Metadata.from_dict(data) else: raise ValueError(f"Invalid zarr_format: {zarr_format}. Expected 2 or 3") raise TypeError # pragma: no cover -def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None) -> CodecPipeline: +def create_codec_pipeline( + metadata: ArrayMetadata, + *, + store: Store | None = None, + config: ArrayConfig | None = None, +) -> CodecPipeline: + pipeline_class: type[CodecPipeline] + if config is not None: + pipeline_class = config.codec_pipeline_class + else: + pipeline_class = get_pipeline_class() + if store is not None: try: - return get_pipeline_class().from_array_metadata_and_store( + return pipeline_class.from_array_metadata_and_store( array_metadata=metadata, store=store ) except NotImplementedError: pass if isinstance(metadata, ArrayV3Metadata): - return get_pipeline_class().from_codecs(metadata.codecs) + return pipeline_class.from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + return pipeline_class.from_codecs([v2_codec]) raise TypeError # pragma: no cover @@ -360,7 +371,9 @@ def __init__( config: ArrayConfigLike | None = None, ) -> None: config_parsed = parse_array_config(config) - metadata_parsed = parse_array_metadata(metadata, config=config_parsed) + metadata_parsed = parse_array_metadata( + metadata, codec_class_map=config_parsed.codec_class_map + ) object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) @@ -369,7 +382,9 @@ def __init__( object.__setattr__( self, "codec_pipeline", - 
create_codec_pipeline(metadata=metadata_parsed, store=store_path.store), + create_codec_pipeline( + metadata=metadata_parsed, store=store_path.store, config=config_parsed + ), ) # this overload defines the function signature when zarr_format is 2 @@ -785,6 +800,7 @@ def _create_metadata_v3( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, + codec_class_map: Mapping[str, type[Codec]] | None = None, ) -> ArrayV3Metadata: """Create an instance of ArrayV3Metadata.""" filters: tuple[ArrayArrayCodec, ...] @@ -822,6 +838,7 @@ def _create_metadata_v3( codecs=codecs_parsed, # type: ignore[arg-type] dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, + codec_class_map=codec_class_map, ) @classmethod @@ -869,6 +886,7 @@ async def _create_v3( codecs=codecs, dimension_names=dimension_names, attributes=attributes, + codec_class_map=config.codec_class_map, ) array = cls(metadata=metadata, store_path=store_path, config=config) @@ -993,7 +1011,9 @@ def from_dict( ValueError If the dictionary data is invalid or incompatible with either Zarr format 2 or 3 array creation. """ - metadata = parse_array_metadata(data) + from zarr.core.array_spec import parse_codec_class_map + + metadata = parse_array_metadata(data, codec_class_map=parse_codec_class_map(None)) return cls(metadata=metadata, store_path=store_path) @classmethod @@ -1001,6 +1021,8 @@ async def open( cls, store: StoreLike, zarr_format: ZarrFormat | None = 3, + *, + config: ArrayConfigLike | None = None, ) -> AnyAsyncArray: """ Async method to open an existing Zarr array from a given store. @@ -1013,6 +1035,8 @@ async def open( for a description of all valid StoreLike values. zarr_format : ZarrFormat | None, optional The Zarr format version (default is 3). + config : ArrayConfigLike | None, (default is None) + Runtime configuration for the array. 
Returns ------- @@ -1044,7 +1068,7 @@ async def example(): metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) # TODO: remove this cast when we have better type hints _metadata_dict = cast("ArrayMetadataJSON_V3", metadata_dict) - return cls(store_path=store_path, metadata=_metadata_dict) + return cls(store_path=store_path, metadata=_metadata_dict, config=config) @property def store(self) -> Store: @@ -4710,7 +4734,7 @@ async def init_array( chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: DimensionNamesLike = None, overwrite: bool = False, - config: ArrayConfigLike | None = None, + config: ArrayConfig | None = None, ) -> AnyAsyncArray: """Create and persist an array metadata document. @@ -4948,6 +4972,7 @@ async def init_array( codecs=codecs_out, dimension_names=dimension_names, attributes=attributes, + codec_class_map=config.codec_class_map if config is not None else None, ) arr = AsyncArray(metadata=meta, store_path=store_path, config=config) @@ -5145,7 +5170,7 @@ async def create_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, overwrite=overwrite, - config=config, + config=parse_array_config(config), ) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 5617a24d01..a27f8bd2cc 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, fields from typing import TYPE_CHECKING, Any, Final, Literal, Self, TypedDict, cast -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline from zarr.core.common import ( MemoryOrder, parse_bool, @@ -30,7 +30,7 @@ class CodecPipelineRequest(TypedDict): options: NotRequired[dict[str, object]] -class ArrayConfigParams(TypedDict): +class ArrayConfigParams(TypedDict, closed=True): # type: ignore[call-arg] """ A TypedDict model of the 
attributes of an ArrayConfig class. """ @@ -38,11 +38,11 @@ class ArrayConfigParams(TypedDict): order: MemoryOrder write_empty_chunks: bool read_missing_chunks: bool - codec_class_map: Mapping[str, object] - codec_pipeline_class: CodecPipelineRequest + codec_class_map: Mapping[str, type[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec]] + codec_pipeline_class: type[CodecPipeline] -class ArrayConfigRequest(TypedDict): +class ArrayConfigRequest(TypedDict, closed=True): # type: ignore[call-arg] """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -55,7 +55,7 @@ class ArrayConfigRequest(TypedDict): codec_class_map: NotRequired[ Mapping[str, type[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec]] ] - codec_pipeline_class: NotRequired[CodecPipelineRequest] + codec_pipeline_class: NotRequired[type[CodecPipeline]] ArrayConfigKeys = Literal[ @@ -112,8 +112,8 @@ class ArrayConfig: codec_class_map : Mapping[str, object] A codec name : codec class mapping that defines the codec classes available for array creation. - codec_pipeline_class : CodecPipelineRequest - A request for a pipeline class that will be used for orchestrating chunk encoding and + codec_pipeline_class : type[CodecPipeline] + A codec pipeline class that will be used for orchestrating chunk encoding and decoding. 
""" @@ -121,7 +121,7 @@ class ArrayConfig: write_empty_chunks: bool read_missing_chunks: bool codec_class_map: Mapping[str, type[Codec]] - codec_pipeline_class: CodecPipelineRequest + codec_pipeline_class: type[CodecPipeline] def __init__( self, @@ -131,7 +131,7 @@ def __init__( read_missing_chunks: bool = True, codec_class_map: Mapping[str, type[ArrayBytesCodec | ArrayArrayCodec | BytesBytesCodec]] | None = None, - codec_pipeline_class: CodecPipelineRequest | None = None, + codec_pipeline_class: type[CodecPipeline] | None = None, ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) @@ -213,19 +213,16 @@ def _import_by_name(path: str) -> object | type: return obj -def parse_codec_pipeline_class(obj: CodecPipelineRequest | None) -> CodecPipelineRequest: +def parse_codec_pipeline_class(obj: type[CodecPipeline] | None) -> type[CodecPipeline]: if obj is None: - config_entry: dict[str, str | int] = zarr_config.get("codec_pipeline") + config_entry: dict[str, str] = zarr_config.get("codec_pipeline") if "path" not in config_entry: msg = ( "The codec_pipeline field in the global config is malformed. " "Expected 'path' key was not found." 
) raise KeyError(msg) - else: - path = config_entry["path"] - options = {"batch_size": config_entry.get("batch_size", 1)} - return {"class_path": path, "options": options} + return _import_by_name(config_entry["path"]) # type: ignore[return-value] return obj @@ -241,6 +238,9 @@ def parse_codec_class_map(obj: Mapping[str, type[Codec]] | None) -> Mapping[str, out: dict[str, type[Codec]] = {} for key, value in name_map.items(): maybe_cls = _import_by_name(value) + if not isinstance(maybe_cls, type): + msg = f"Expected a type, got {maybe_cls}" + raise TypeError(msg) if not issubclass(maybe_cls, Codec): msg = f"Expected a subclass of `Codec`, got {maybe_cls}" raise TypeError(msg) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index aea04359b7..c370680b41 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -459,14 +459,14 @@ def __init__( dimension_names: DimensionNamesLike, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, - _config: ArrayConfig | None = None, + codec_class_map: Mapping[str, type[Codec]] | None = None, ) -> None: """ Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ - # generate a new default config if _config is `None` - if _config is None: - _config = ArrayConfig.from_dict({}) + from zarr.core.array_spec import parse_codec_class_map + + codec_class_map_parsed = parse_codec_class_map(codec_class_map) shape_parsed = parse_shapelike(shape) chunk_grid_parsed = parse_chunk_grid(chunk_grid) chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) @@ -474,7 +474,7 @@ def __init__( # Note: relying on a type method is numpy-specific fill_value_parsed = data_type.cast_scalar(fill_value) attributes_parsed = parse_attributes(attributes) - codecs_parsed_partial = parse_codecs(codecs, _config.codec_class_map) + codecs_parsed_partial = parse_codecs(codecs, codec_class_map_parsed) 
storage_transformers_parsed = parse_storage_transformers(storage_transformers) extra_fields_parsed = parse_extra_fields(extra_fields) array_spec = ArraySpec( @@ -482,7 +482,7 @@ def __init__( dtype=data_type, fill_value=fill_value_parsed, config=ArraySpecConfig( - write_empty_chunks=_config.write_empty_chunks, order=_config.order + write_empty_chunks=False, order="C" ), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. ) @@ -499,7 +499,6 @@ def __init__( object.__setattr__(self, "attributes", attributes_parsed) object.__setattr__(self, "storage_transformers", storage_transformers_parsed) object.__setattr__(self, "extra_fields", extra_fields_parsed) - object.__setattr__(self, "_config", _config) self._validate_metadata() @@ -578,7 +577,9 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: } @classmethod - def from_dict(cls, data: dict[str, JSON], *, config: ArrayConfig | None = None) -> Self: + def from_dict( + cls, data: dict[str, JSON], *, codec_class_map: Mapping[str, type[Codec]] | None = None + ) -> Self: # make a copy because we are modifying the dict _data = data.copy() @@ -631,7 +632,7 @@ def from_dict(cls, data: dict[str, JSON], *, config: ArrayConfig | None = None) data_type=data_type, extra_fields=allowed_extra_fields, storage_transformers=_data_typed.get("storage_transformers", ()), # type: ignore[arg-type] - _config=config, + codec_class_map=codec_class_map, ) def to_dict(self) -> dict[str, JSON]: @@ -674,4 +675,7 @@ def with_config(self, config: ArrayConfig | None) -> Self: """ Return a copy of this metadata with a new configuration object. 
""" - return type(self).from_dict(self.to_dict(), config=config) + return type(self).from_dict( + self.to_dict(), + codec_class_map=config.codec_class_map if config is not None else None, + ) diff --git a/tests/test_array.py b/tests/test_array.py index 7bbb9370fe..4c412e1151 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -44,10 +44,11 @@ default_filters_v2, default_serializer_v3, ) -from zarr.core.array_spec import ArrayConfig, ArrayConfigRequest +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, ArrayConfigRequest from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams +from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.common import JSON, ZarrFormat, ceildiv from zarr.core.dtype import ( DateTime64, @@ -2314,11 +2315,11 @@ def test_with_config_polymorphism() -> None: objects. """ source_config: ArrayConfig = ArrayConfig.from_dict({"write_empty_chunks": False, "order": "F"}) - source_config_dict = source_config.to_dict() + source_config_dict: ArrayConfigParams = source_config.to_dict() arr = zarr.create_array({}, shape=(1,), dtype="uint8") arr_source_config = arr.with_config(source_config) - arr_source_config_dict = arr.with_config(source_config_dict) + arr_source_config_dict = arr.with_config(source_config_dict) # type: ignore[arg-type] assert arr_source_config.config == arr_source_config_dict.config @@ -2330,9 +2331,29 @@ def test_array_config_specify_codecs() -> None: class FakeGzipCodec(GzipCodec): ... 
- store = {} + store = {} # type: ignore[var-annotated] arr = zarr.create_array(store, shape=(1,), dtype="uint8", compressors=GzipCodec()) - arr_2 = arr.with_config( - {"codec_class_map": {**arr.config.codec_class_map, "gzip": FakeGzipCodec}} - ) + new_config: ArrayConfigRequest = { + "codec_class_map": {**arr.config.codec_class_map, "gzip": FakeGzipCodec} + } + arr_2 = arr.with_config(new_config) assert isinstance(arr_2.compressors[0], FakeGzipCodec) + + arr_3 = zarr.open_array(store=store, config=new_config) + assert isinstance(arr_3.compressors[0], FakeGzipCodec) + + +def test_array_config_specify_codecpipeline() -> None: + """ + Test that we can use the array configuration to open an array with a different codec pipeline + """ + store = {} # type: ignore[var-annotated] + + class FakeCodecPipeline(BatchedCodecPipeline): ... + + arr = zarr.create_array( + store, shape=(1,), dtype="uint8", config={"codec_pipeline_class": FakeCodecPipeline} + ) + assert isinstance(arr.async_array.codec_pipeline, FakeCodecPipeline) + arr_2 = arr.with_config({"codec_pipeline_class": BatchedCodecPipeline}) + assert isinstance(arr_2.async_array.codec_pipeline, BatchedCodecPipeline) From b9b84110e3c65e71198949f8d33bc914d548afe4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 11:17:05 +0200 Subject: [PATCH 04/10] use typing_extensions for typeddict --- src/zarr/core/array_spec.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index a27f8bd2cc..fa90314b70 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -1,7 +1,9 @@ from __future__ import annotations from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Any, Final, Literal, Self, TypedDict, cast +from typing import TYPE_CHECKING, Any, Final, Literal, Self, cast + +from typing_extensions import TypedDict from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, 
CodecPipeline from zarr.core.common import ( From f06e3dcf1f02b30f06d9e1d048f2c13938d1c9ed Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 11:19:57 +0200 Subject: [PATCH 05/10] fix: handle arrayv2metadata --- src/zarr/core/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index dec4c23941..6754cdf162 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -206,6 +206,8 @@ def _chunk_sizes_from_shape( def parse_array_metadata(data: object, codec_class_map: Mapping[str, type[Codec]]) -> ArrayMetadata: if isinstance(data, ArrayV3Metadata): return type(data).from_dict(data.to_dict(), codec_class_map=codec_class_map) + elif isinstance(data, ArrayV2Metadata): + return data elif isinstance(data, dict): zarr_format = data.get("zarr_format") if zarr_format == 3: From 7d2383e0a4034766be2c94cfcae4643eca3293e7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 11:25:18 +0200 Subject: [PATCH 06/10] fix: add note --- src/zarr/core/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6754cdf162..7a44dd6020 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -207,6 +207,7 @@ def parse_array_metadata(data: object, codec_class_map: Mapping[str, type[Codec] if isinstance(data, ArrayV3Metadata): return type(data).from_dict(data.to_dict(), codec_class_map=codec_class_map) elif isinstance(data, ArrayV2Metadata): + # V2 arrays get their codecs from numcodecs, for now. the codec class map is not used. 
return data elif isinstance(data, dict): zarr_format = data.get("zarr_format") From 66d0476e3312db16a7115c6d1c6c82dbf46853d6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 11:28:24 +0200 Subject: [PATCH 07/10] fix: fix test_parse_codecs_unknown_codec --- tests/test_metadata/test_v3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 01ed921053..e59fdcbf14 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -345,7 +345,7 @@ def test_parse_codecs_unknown_codec_raises(monkeypatch: pytest.MonkeyPatch) -> N codecs = [{"name": "unknown"}] with pytest.raises(UnknownCodecError): - parse_codecs(codecs) + parse_codecs(codecs, codec_class_map={}) @pytest.mark.parametrize( From 99967f354bf48e38756a8a19ac40241483419377 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 11:31:02 +0200 Subject: [PATCH 08/10] docs: changelog --- changes/3892.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3892.feature.md diff --git a/changes/3892.feature.md b/changes/3892.feature.md new file mode 100644 index 0000000000..a602602209 --- /dev/null +++ b/changes/3892.feature.md @@ -0,0 +1 @@ +Add `codec_class_map` and `codec_pipeline_class` fields to the runtime array configuration. This allows explicitly declaring the codec classes and codec pipeline class to use when reading an array, as well as dynamically swapping out the codec classes or the codec pipeline class on an existing `zarr.Array`. 
\ No newline at end of file From 3be1eb6ad2a4595201812fd608ac3ec981847b3e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 13:17:32 +0200 Subject: [PATCH 09/10] fix: resolve array parsing bug --- src/zarr/core/array.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7a44dd6020..a01b21eb5f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -128,6 +128,7 @@ ChunkGridMetadata, RectilinearChunkGridMetadata, RegularChunkGridMetadata, + parse_codecs, parse_node_type_array, resolve_chunks, ) @@ -205,7 +206,10 @@ def _chunk_sizes_from_shape( def parse_array_metadata(data: object, codec_class_map: Mapping[str, type[Codec]]) -> ArrayMetadata: if isinstance(data, ArrayV3Metadata): - return type(data).from_dict(data.to_dict(), codec_class_map=codec_class_map) + new_codecs = parse_codecs( + [c.to_dict() for c in data.codecs], codec_class_map=codec_class_map + ) + return replace(data, codecs=new_codecs) elif isinstance(data, ArrayV2Metadata): # V2 arrays get their codecs from numcodecs, for now. the codec class map is not used. 
return data From 741e99e90e8287cec413e9572713e612562523e0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 13:27:42 +0200 Subject: [PATCH 10/10] fix: never use string-based imports --- src/zarr/core/array_spec.py | 65 ++++++++----------------------------- 1 file changed, 14 insertions(+), 51 deletions(-) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index fa90314b70..a187356b5c 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -5,7 +5,6 @@ from typing_extensions import TypedDict -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline from zarr.core.common import ( MemoryOrder, parse_bool, @@ -19,6 +18,13 @@ from collections.abc import Mapping from typing import NotRequired + from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BytesBytesCodec, + Codec, + CodecPipeline, + ) from zarr.core.buffer import BufferPrototype from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -187,67 +193,24 @@ def to_dict(self) -> ArrayConfigParams: ArrayConfigLike = ArrayConfig | ArrayConfigRequest -def _import_by_name(path: str) -> object | type: - """ - Import an object by its fully qualified name. 
- """ - import importlib - - parts = path.split(".") - - # Try progressively shorter module paths - for i in range(len(parts), 0, -1): - module_path = ".".join(parts[:i]) - try: - module = importlib.import_module(module_path) - break - except ModuleNotFoundError: - continue - else: - raise ImportError(f"Could not import any module from '{path}'") - - obj = module - for attr in parts[i:]: - try: - obj = getattr(obj, attr) - except AttributeError as e: - raise ImportError(f"Attribute '{attr}' not found in '{obj}'") from e - return obj - - def parse_codec_pipeline_class(obj: type[CodecPipeline] | None) -> type[CodecPipeline]: if obj is None: - config_entry: dict[str, str] = zarr_config.get("codec_pipeline") - if "path" not in config_entry: - msg = ( - "The codec_pipeline field in the global config is malformed. " - "Expected 'path' key was not found." - ) - raise KeyError(msg) - return _import_by_name(config_entry["path"]) # type: ignore[return-value] + from zarr.registry import get_pipeline_class + + return get_pipeline_class() return obj def parse_codec_class_map(obj: Mapping[str, type[Codec]] | None) -> Mapping[str, type[Codec]]: """ Convert a request for a codec class map into an actual Mapping[str, type[Codec]]. - If the input is `None`, then we look up the list of codecs from the registry, where they - are stored as fully qualified class names. We must resolve these names to concrete classes - before inserting them into the returned mapping. + If the input is `None`, build the map from the codec registry. 
""" if obj is None: + from zarr.registry import get_codec_class + name_map: dict[str, str] = zarr_config.get("codecs", {}) - out: dict[str, type[Codec]] = {} - for key, value in name_map.items(): - maybe_cls = _import_by_name(value) - if not isinstance(maybe_cls, type): - msg = f"Expected a type, got {maybe_cls}" - raise TypeError(msg) - if not issubclass(maybe_cls, Codec): - msg = f"Expected a subclass of `Codec`, got {maybe_cls}" - raise TypeError(msg) - out[key] = maybe_cls - return out + return {key: get_codec_class(key) for key in name_map} return obj