Skip to content

Commit 89e84c8

Browse files
yingjianwu98stevie9868
andauthored
feat: should dictionary encode based on size (#4972)
This PR enables check on dictionaryEncoded based on size related to this issue #4898 **Dictionary encoding splits data into two parts:** 1. Dictionary: stores unique values 2. Indices: maps each value to a dictionary entry **For FixedWidth:** 1. Dictionary: cardinality × 16 bytes (128 bits per value) 2. Indices: num_values × 4 bytes (32-bit i32) **For VariableWidth (strings/binary):** 1. Dictionary values: cardinality × avg_value_size (actual data) 2. Dictionary offsets: cardinality × offset_size (32 or 64 bits) 3. Indices: num_values × offset_size (same as dictionary offsets) Co-authored-by: stevie9868 <yingjianwu2@email.com>
1 parent 846effe commit 89e84c8

File tree

4 files changed

+237
-32
lines changed

4 files changed

+237
-32
lines changed

rust/lance-encoding/src/constants.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ pub const RLE_THRESHOLD_META_KEY: &str = "lance-encoding:rle-threshold";
1919
/// Set to a large value to discourage dictionary encoding
2020
/// Set to a small value to encourage dictionary encoding
2121
pub const DICT_DIVISOR_META_KEY: &str = "lance-encoding:dict-divisor";
22+
/// Metadata key for dictionary encoding size ratio threshold (0.0-1.0]
23+
/// If estimated_dict_size/raw_size < ratio, use dictionary encoding.
24+
/// Example: 0.8 means use dict if encoded size < 80% of raw size
25+
/// Default: 0.8
26+
pub const DICT_SIZE_RATIO_META_KEY: &str = "lance-encoding:dict-size-ratio";
2227

2328
// NOTE: BLOB_META_KEY is defined in lance-core to avoid circular dependency
2429

rust/lance-encoding/src/encodings/logical/primitive.rs

Lines changed: 222 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ use std::{
1414

1515
use crate::{
1616
constants::{
17-
DICT_DIVISOR_META_KEY, STRUCTURAL_ENCODING_FULLZIP, STRUCTURAL_ENCODING_META_KEY,
18-
STRUCTURAL_ENCODING_MINIBLOCK,
17+
STRUCTURAL_ENCODING_FULLZIP, STRUCTURAL_ENCODING_META_KEY, STRUCTURAL_ENCODING_MINIBLOCK,
1918
},
2019
data::DictionaryDataBlock,
2120
encodings::logical::primitive::blob::{BlobDescriptionPageScheduler, BlobPageScheduler},
@@ -65,6 +64,10 @@ use crate::{
6564
};
6665
use lance_core::{datatypes::Field, utils::tokio::spawn_cpu, Result};
6766

67+
use crate::constants::DICT_SIZE_RATIO_META_KEY;
68+
use crate::encodings::logical::primitive::dict::{
69+
DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE,
70+
};
6871
use crate::{
6972
buffer::LanceBuffer,
7073
data::{BlockInfo, DataBlockBuilder, FixedWidthDataBlock},
@@ -4117,7 +4120,70 @@ impl PrimitiveStructuralEncoder {
41174120
})
41184121
}
41194122

4123+
/// Estimates the total size of dictionary-encoded data
4124+
///
4125+
/// Dictionary encoding splits data into two parts:
4126+
/// 1. Dictionary: stores unique values
4127+
/// 2. Indices: maps each value to a dictionary entry
4128+
///
4129+
/// For FixedWidth (e.g., 128-bit Decimal):
4130+
/// - Dictionary: cardinality × 16 bytes (128 bits per value)
4131+
/// - Indices: num_values × 4 bytes (32-bit i32)
4132+
///
4133+
/// For VariableWidth (strings/binary):
4134+
/// - Dictionary values: cardinality × avg_value_size (actual data)
4135+
/// - Dictionary offsets: cardinality × offset_size (32 or 64 bits)
4136+
/// - Indices: num_values × offset_size (same as dictionary offsets)
4137+
fn estimate_dict_size(data_block: &DataBlock) -> Option<u64> {
4138+
let cardinality = if let Some(cardinality_array) = data_block.get_stat(Stat::Cardinality) {
4139+
cardinality_array.as_primitive::<UInt64Type>().value(0)
4140+
} else {
4141+
return None;
4142+
};
4143+
4144+
let num_values = data_block.num_values();
4145+
4146+
match data_block {
4147+
DataBlock::FixedWidth(_) => {
4148+
// Dictionary: cardinality unique values at 128 bits each
4149+
let dict_size = cardinality * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8);
4150+
// Indices: num_values indices at 32 bits each
4151+
let indices_size = num_values * (DICT_INDICES_BITS_PER_VALUE / 8);
4152+
Some(dict_size + indices_size)
4153+
}
4154+
DataBlock::VariableWidth(var) => {
4155+
// Only 32-bit and 64-bit offsets are supported
4156+
if var.bits_per_offset != 32 && var.bits_per_offset != 64 {
4157+
return None;
4158+
}
4159+
let bits_per_offset = var.bits_per_offset as u64;
4160+
4161+
let data_size = data_block.data_size();
4162+
let avg_value_size = data_size / num_values;
4163+
4164+
// Dictionary values: actual bytes of unique strings/binary
4165+
let dict_values_size = cardinality * avg_value_size;
4166+
// Dictionary offsets: pointers into dictionary values
4167+
let dict_offsets_size = cardinality * (bits_per_offset / 8);
4168+
// Indices: map each row to dictionary entry
4169+
let indices_size = num_values * (bits_per_offset / 8);
4170+
4171+
Some(dict_values_size + dict_offsets_size + indices_size)
4172+
}
4173+
_ => None,
4174+
}
4175+
}
4176+
41204177
fn should_dictionary_encode(data_block: &DataBlock, field: &Field) -> bool {
4178+
// Since we only dictionary encode FixedWidth and VariableWidth blocks for now, we skip
4179+
// estimating the size for every other block type and never dictionary encode them
4180+
if !matches!(
4181+
data_block,
4182+
DataBlock::FixedWidth(_) | DataBlock::VariableWidth(_)
4183+
) {
4184+
return false;
4185+
}
4186+
41214187
// Don't dictionary encode tiny arrays
41224188
let too_small = env::var("LANCE_ENCODING_DICT_TOO_SMALL")
41234189
.ok()
@@ -4127,35 +4193,40 @@ impl PrimitiveStructuralEncoder {
41274193
return false;
41284194
}
41294195

4130-
// Somewhat arbitrary threshold rule. Apply dictionary encoding if the number of unique
4131-
// values is less than 1/2 the total number of values.
4132-
let divisor: u64 = field
4196+
// Get size ratio from metadata or env var, default to 0.8
4197+
let threshold_ratio = field
41334198
.metadata
4134-
.get(DICT_DIVISOR_META_KEY)
4135-
.map(|val| val.parse().ok())
4136-
.unwrap_or_else(|| {
4137-
env::var("LANCE_ENCODING_DICT_DIVISOR")
4199+
.get(DICT_SIZE_RATIO_META_KEY)
4200+
.and_then(|val| val.parse::<f64>().ok())
4201+
.or_else(|| {
4202+
env::var("LANCE_ENCODING_DICT_SIZE_RATIO")
41384203
.ok()
41394204
.and_then(|val| val.parse().ok())
41404205
})
4141-
.unwrap_or(2);
4206+
.unwrap_or(0.8);
41424207

4143-
// Cap on cardinality. This should be pushed into the cardinality estimation to avoid
4144-
// spending too much time estimating cardinality.
4145-
let max_cardinality = env::var("LANCE_ENCODING_DICT_MAX_CARDINALITY")
4146-
.ok()
4147-
.and_then(|val| val.parse().ok())
4148-
.unwrap_or(100000);
4208+
// Validate size ratio is in valid range
4209+
if threshold_ratio <= 0.0 || threshold_ratio > 1.0 {
4210+
panic!(
4211+
"Invalid parameter: dict-size-ratio is {} which is not in the range (0, 1].",
4212+
threshold_ratio
4213+
);
4214+
}
41494215

4150-
let threshold = (data_block.num_values() / divisor).min(max_cardinality);
4216+
// Get raw data size
4217+
let data_size = data_block.data_size();
41514218

4152-
let cardinality = if let Some(cardinality_array) = data_block.get_stat(Stat::Cardinality) {
4153-
cardinality_array.as_primitive::<UInt64Type>().value(0)
4154-
} else {
4155-
u64::MAX
4219+
// Estimate dictionary-encoded size
4220+
let Some(encoded_size) = Self::estimate_dict_size(data_block) else {
4221+
return false;
41564222
};
41574223

4158-
cardinality < threshold
4224+
let size_ratio_actual = if data_size > 0 {
4225+
encoded_size as f64 / data_size as f64
4226+
} else {
4227+
return false;
4228+
};
4229+
size_ratio_actual < threshold_ratio
41594230
}
41604231

41614232
// Creates an encode task, consuming all buffered data
@@ -4400,7 +4471,14 @@ impl FieldEncoder for PrimitiveStructuralEncoder {
44004471
#[cfg(test)]
44014472
#[allow(clippy::single_range_in_vec_init)]
44024473
mod tests {
4474+
use super::{
4475+
ChunkInstructions, DataBlock, DecodeMiniBlockTask, FixedPerValueDecompressor,
4476+
FixedWidthDataBlock, FullZipCacheableState, FullZipDecodeDetails, FullZipRepIndexDetails,
4477+
FullZipScheduler, MiniBlockRepIndex, PerValueDecompressor, PreambleAction,
4478+
StructuralPageScheduler,
4479+
};
44034480
use crate::constants::{STRUCTURAL_ENCODING_META_KEY, STRUCTURAL_ENCODING_MINIBLOCK};
4481+
use crate::data::BlockInfo;
44044482
use crate::decoder::PageEncoding;
44054483
use crate::encodings::logical::primitive::{
44064484
ChunkDrainInstructions, PrimitiveStructuralEncoder,
@@ -4409,18 +4487,11 @@ mod tests {
44094487
use crate::format::pb21::compressive_encoding::Compression;
44104488
use crate::testing::{check_round_trip_encoding_of_data, TestCases};
44114489
use crate::version::LanceFileVersion;
4412-
use arrow_array::{ArrayRef, Int8Array, StringArray};
4490+
use arrow_array::{ArrayRef, Int8Array, StringArray, UInt64Array};
44134491
use arrow_schema::DataType;
44144492
use std::collections::HashMap;
44154493
use std::{collections::VecDeque, sync::Arc};
44164494

4417-
use super::{
4418-
ChunkInstructions, DataBlock, DecodeMiniBlockTask, FixedPerValueDecompressor,
4419-
FixedWidthDataBlock, FullZipCacheableState, FullZipDecodeDetails, FullZipRepIndexDetails,
4420-
FullZipScheduler, MiniBlockRepIndex, PerValueDecompressor, PreambleAction,
4421-
StructuralPageScheduler,
4422-
};
4423-
44244495
#[test]
44254496
fn test_is_narrow() {
44264497
let int8_array = Int8Array::from(vec![1, 2, 3]);
@@ -5439,4 +5510,125 @@ mod tests {
54395510

54405511
check_round_trip_encoding_of_data(vec![string_array], &test_cases, HashMap::new()).await;
54415512
}
5513+
5514+
// Dictionary encoding decision tests
5515+
/// Helper to create FixedWidth test data block with exact cardinality stat injected
5516+
/// to ensure consistent test behavior (avoids HLL estimation error)
5517+
fn create_test_fixed_data_block(num_values: u64, cardinality: u64) -> DataBlock {
5518+
use crate::statistics::Stat;
5519+
5520+
let block_info = BlockInfo::default();
5521+
5522+
// Manually inject exact cardinality stat for consistent test behavior
5523+
let cardinality_array = Arc::new(UInt64Array::from(vec![cardinality]));
5524+
block_info
5525+
.0
5526+
.write()
5527+
.unwrap()
5528+
.insert(Stat::Cardinality, cardinality_array);
5529+
5530+
DataBlock::FixedWidth(FixedWidthDataBlock {
5531+
bits_per_value: 32,
5532+
data: crate::buffer::LanceBuffer::from(vec![0u8; (num_values * 4) as usize]),
5533+
num_values,
5534+
block_info,
5535+
})
5536+
}
5537+
5538+
/// Helper to create VariableWidth (string) test data block with exact cardinality
5539+
fn create_test_variable_width_block(num_values: u64, cardinality: u64) -> DataBlock {
5540+
use crate::statistics::Stat;
5541+
use arrow_array::StringArray;
5542+
5543+
assert!(cardinality <= num_values && cardinality > 0);
5544+
5545+
let mut values = Vec::with_capacity(num_values as usize);
5546+
for i in 0..num_values {
5547+
values.push(format!("value_{:016}", i % cardinality));
5548+
}
5549+
5550+
let array = StringArray::from(values);
5551+
let block = DataBlock::from_array(Arc::new(array) as ArrayRef);
5552+
5553+
// Manually inject stats for consistent test behavior
5554+
if let DataBlock::VariableWidth(ref var_block) = block {
5555+
let mut info = var_block.block_info.0.write().unwrap();
5556+
// Cardinality: exact value to avoid HLL estimation error
5557+
info.insert(
5558+
Stat::Cardinality,
5559+
Arc::new(UInt64Array::from(vec![cardinality])),
5560+
);
5561+
}
5562+
5563+
block
5564+
}
5565+
5566+
#[test]
fn test_estimate_dict_size_fixed_width() {
    use crate::encodings::logical::primitive::dict::{
        DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE,
    };

    const NUM_VALUES: u64 = 1000;
    const CARDINALITY: u64 = 400;

    let block = create_test_fixed_data_block(NUM_VALUES, CARDINALITY);
    let actual = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap();

    // Expected: one dictionary entry per unique value plus one index per row,
    // each sized by the shared dictionary constants.
    let dict_bytes = CARDINALITY * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8);
    let index_bytes = NUM_VALUES * (DICT_INDICES_BITS_PER_VALUE / 8);

    assert_eq!(actual, dict_bytes + index_bytes);
}
5583+
5584+
#[test]
fn test_estimate_dict_size_variable_width() {
    let block = create_test_variable_width_block(1000, 400);
    let actual = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap();

    // Mirror the estimator: truncated average bytes per row, taken from the raw block size.
    let avg_value_size = block.data_size() / 1000;

    // 400 dictionary values, 400 four-byte dictionary offsets, 1000 four-byte indices.
    let dict_values = 400 * avg_value_size;
    let dict_offsets = 400 * 4;
    let indices = 1000 * 4;

    assert_eq!(actual, dict_values + dict_offsets + indices);
}
5597+
5598+
#[test]
fn test_should_dictionary_encode() {
    use crate::constants::DICT_SIZE_RATIO_META_KEY;
    use lance_core::datatypes::Field as LanceField;

    // 1000 strings drawn from only 10 distinct values: dictionary encoding saves space.
    let block = create_test_variable_width_block(1000, 10);

    let metadata = HashMap::from([(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string())]);
    let arrow_field =
        arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata);
    let field = LanceField::try_from(&arrow_field).unwrap();

    assert!(
        PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field),
        "Should use dictionary encode based on size"
    );
}
5616+
5617+
#[test]
fn test_should_not_dictionary_encode() {
    use crate::constants::DICT_SIZE_RATIO_META_KEY;
    use lance_core::datatypes::Field as LanceField;

    // 1000 four-byte values: the four-byte indices alone already match the raw size.
    let block = create_test_fixed_data_block(1000, 10);

    let metadata = HashMap::from([(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string())]);
    let arrow_field =
        arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata);
    let field = LanceField::try_from(&arrow_field).unwrap();

    assert!(
        !PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field),
        "Should not use dictionary encode based on size"
    );
}
54425634
}

rust/lance-encoding/src/encodings/logical/primitive/dict.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33

44
use std::{collections::HashMap, sync::Arc};
55

6+
/// Bits per value for FixedWidth dictionary values (currently only 128-bit is supported)
7+
pub const DICT_FIXED_WIDTH_BITS_PER_VALUE: u64 = 128;
8+
/// Bits per index for dictionary indices (always i32)
9+
pub const DICT_INDICES_BITS_PER_VALUE: u64 = 32;
10+
611
use arrow_array::{
712
cast::AsArray,
813
types::{
@@ -138,13 +143,13 @@ pub fn dictionary_encode(mut data_block: DataBlock) -> (DataBlock, DataBlock) {
138143
});
139144
let dictionary_data_block = DataBlock::FixedWidth(FixedWidthDataBlock {
140145
data: LanceBuffer::reinterpret_vec(dictionary_buffer),
141-
bits_per_value: 128,
146+
bits_per_value: DICT_FIXED_WIDTH_BITS_PER_VALUE,
142147
num_values: curr_idx as u64,
143148
block_info: BlockInfo::default(),
144149
});
145150
let mut indices_data_block = DataBlock::FixedWidth(FixedWidthDataBlock {
146151
data: LanceBuffer::reinterpret_vec(indices_buffer),
147-
bits_per_value: 32,
152+
bits_per_value: DICT_INDICES_BITS_PER_VALUE,
148153
num_values: fixed_width_data_block.num_values,
149154
block_info: BlockInfo::default(),
150155
});

rust/lance-encoding/src/encodings/physical/block.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,7 @@ mod tests {
731731

732732
use super::*;
733733

734+
use crate::constants::DICT_SIZE_RATIO_META_KEY;
734735
use crate::{
735736
constants::{
736737
COMPRESSION_META_KEY, DICT_DIVISOR_META_KEY, STRUCTURAL_ENCODING_FULLZIP,
@@ -771,6 +772,8 @@ mod tests {
771772
// Some bad cardinality estimation causes us to use dictionary encoding currently
772773
// which causes the expected encoding check to fail.
773774
field_meta.insert(DICT_DIVISOR_META_KEY.to_string(), "100000".to_string());
775+
field_meta.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.0001".to_string());
776+
// (The dict-size-ratio entry above also disables size-based dictionary encoding.)
774777
field_meta.insert(
775778
STRUCTURAL_ENCODING_META_KEY.to_string(),
776779
STRUCTURAL_ENCODING_FULLZIP.to_string(),

0 commit comments

Comments
 (0)