From 0548ca42d4f47baeb1a8d9e60a87932deb5db736 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 20 Sep 2025 19:57:46 +0200 Subject: [PATCH 1/2] Add fanout chunk key encoding --- chunk-key-encodings/fanout/README.md | 65 ++++++++++++++++++++++++++ chunk-key-encodings/fanout/schema.json | 23 +++++++++ 2 files changed, 88 insertions(+) create mode 100644 chunk-key-encodings/fanout/README.md create mode 100644 chunk-key-encodings/fanout/schema.json diff --git a/chunk-key-encodings/fanout/README.md b/chunk-key-encodings/fanout/README.md new file mode 100644 index 0000000..dd30227 --- /dev/null +++ b/chunk-key-encodings/fanout/README.md @@ -0,0 +1,65 @@ +# Fanout chunk key encoding +Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children (i.e., fanout). This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. + +## Chunk key encoding name + +The value of the `name` member in the chunk key encoding object MUST be `fanout`. + +## Configuration parameters + +### `max_children` + +An integer greater than 3 indicating the maximum number of child entries allowed within a single node (e.g., directory). Defaults to 1001 if omitted. + +## Example + +For example, the array metadata below specifies that chunk keys are encoded using the `fanout` strategy with a maximum of 1001 files per directory: + +```json +{ + "chunk_key_encoding": { + "name": "fanout", + "configuration": { + "max_children": 1001 + } + } +} +```` + +## Algorithm +Given chunk coordinates as a tuple of integers and a parameter `max_children`, the chunk key is constructed as follows: + +1. 
For each coordinate `coord` at dimension index `dim` (indexing starts from `0`): + + 1. Create a dimension marker `d{dim}`. + 2. Express `coord` in base `max_children - 1`, producing a sequence of digits (most significant first). + 3. Join the digits with `/` and prepend the dimension marker to form a subpath. For example: + + ``` + d{dim}/{digit0}/{digit1}/…/{digitN} + ``` + +2. Concatenate all dimension subpaths (in order from the lowest to highest dimension) using `/` as a separator. + +3. Append the chunk marker `c` as the final path component, joined with `/` to any preceding subpaths, to indicate the chunk file itself. For a zero-dimensional array (no coordinates) the chunk key is just `c`. + +> **Note:** Because nodes may also contain reserved entries such as the dimensional markers `dN` and the final chunk marker `c`, the effective numeric base used to subdivide coordinates is `max_children - 1`. + +> **Note:** This method ensures that no directory contains more than `max_children` child entries. Existing chunks never need to be moved or reorganized to maintain this property when new chunks are added. + +### Example +With `max_children = 101` (effective base = 100): + +| Coordinates | Chunk key | +| ------------------ | ---------------------------- | +| `()` | `c` | +| `(123,)` | `d0/1/23/c` | +| `(1234, 5, 67890)` | `d0/12/34/d1/5/d2/6/78/90/c` | + +## Change log + +No changes yet. 
+ +## Current maintainers + +* Remco Leijenaar (GitHub: [RFLeijenaar](https://github.com/RFLeijenaar)) \ No newline at end of file diff --git a/chunk-key-encodings/fanout/schema.json b/chunk-key-encodings/fanout/schema.json new file mode 100644 index 0000000..68b02d5 --- /dev/null +++ b/chunk-key-encodings/fanout/schema.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "name": { + "type": "string", + "const": "fanout" + }, + "configuration": { + "type": "object", + "properties": { + "max_children": { + "type": "integer", + "minimum": 3, + "default": 1001 + } + }, + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false +} From 7176c0fe28befe40398c659f1d33d9e7191a4514 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 20 Sep 2025 20:02:02 +0200 Subject: [PATCH 2/2] Update to algorithm description --- chunk-key-encodings/fanout/README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/chunk-key-encodings/fanout/README.md b/chunk-key-encodings/fanout/README.md index dd30227..4657cf7 100644 --- a/chunk-key-encodings/fanout/README.md +++ b/chunk-key-encodings/fanout/README.md @@ -1,5 +1,5 @@ # Fanout chunk key encoding -Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children (i.e., fanout). This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. 
+Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children. This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. ## Chunk key encoding name @@ -9,7 +9,7 @@ The value of the `name` member in the chunk key encoding object MUST be `fanout` ### `max_children` -An integer greater than 3 indicating the maximum number of child entries allowed within a single node (e.g., directory). Defaults to 1001 if omitted. +An integer greater than or equal to 3 indicating the maximum number of child entries (fanout) allowed within a single node. Defaults to 1001 if omitted. ## Example @@ -24,20 +24,18 @@ For example, the array metadata below specifies that chunk keys are encoded usin } } } -```` +``` ## Algorithm Given chunk coordinates as a tuple of integers and a parameter `max_children`, the chunk key is constructed as follows: 1. For each coordinate `coord` at dimension index `dim` (indexing starts from `0`): - 1. Create a dimension marker `d{dim}`. 2. Express `coord` in base `max_children - 1`, producing a sequence of digits (most significant first). - 3. Join the digits with `/` and prepend the dimension marker to form a subpath. For example: - - ``` - d{dim}/{digit0}/{digit1}/…/{digitN} - ``` + 3. Join the digits with `/` and prepend the dimension marker to form a subpath. This creates a subpath of the form: + ``` + d{dim}/{digit0}/{digit1}/.../{digitN} + ``` 2. Concatenate all dimension subpaths (in order from the lowest to highest dimension) using `/` as a separator.