@@ -14,8 +14,7 @@ use std::{
1414
1515use crate :: {
1616 constants:: {
17- DICT_DIVISOR_META_KEY , STRUCTURAL_ENCODING_FULLZIP , STRUCTURAL_ENCODING_META_KEY ,
18- STRUCTURAL_ENCODING_MINIBLOCK ,
17+ STRUCTURAL_ENCODING_FULLZIP , STRUCTURAL_ENCODING_META_KEY , STRUCTURAL_ENCODING_MINIBLOCK ,
1918 } ,
2019 data:: DictionaryDataBlock ,
2120 encodings:: logical:: primitive:: blob:: { BlobDescriptionPageScheduler , BlobPageScheduler } ,
@@ -65,6 +64,10 @@ use crate::{
6564} ;
6665use lance_core:: { datatypes:: Field , utils:: tokio:: spawn_cpu, Result } ;
6766
67+ use crate :: constants:: DICT_SIZE_RATIO_META_KEY ;
68+ use crate :: encodings:: logical:: primitive:: dict:: {
69+ DICT_FIXED_WIDTH_BITS_PER_VALUE , DICT_INDICES_BITS_PER_VALUE ,
70+ } ;
6871use crate :: {
6972 buffer:: LanceBuffer ,
7073 data:: { BlockInfo , DataBlockBuilder , FixedWidthDataBlock } ,
@@ -4117,7 +4120,70 @@ impl PrimitiveStructuralEncoder {
41174120 } )
41184121 }
41194122
4123+ /// Estimates the total size of dictionary-encoded data
4124+ ///
4125+ /// Dictionary encoding splits data into two parts:
4126+ /// 1. Dictionary: stores unique values
4127+ /// 2. Indices: maps each value to a dictionary entry
4128+ ///
4129+ /// For FixedWidth (e.g., 128-bit Decimal):
4130+ /// - Dictionary: cardinality × 16 bytes (128 bits per value)
4131+ /// - Indices: num_values × 4 bytes (32-bit i32)
4132+ ///
4133+ /// For VariableWidth (strings/binary):
4134+ /// - Dictionary values: cardinality × avg_value_size (actual data)
4135+ /// - Dictionary offsets: cardinality × offset_size (32 or 64 bits)
4136+ /// - Indices: num_values × offset_size (same as dictionary offsets)
4137+ fn estimate_dict_size ( data_block : & DataBlock ) -> Option < u64 > {
4138+ let cardinality = if let Some ( cardinality_array) = data_block. get_stat ( Stat :: Cardinality ) {
4139+ cardinality_array. as_primitive :: < UInt64Type > ( ) . value ( 0 )
4140+ } else {
4141+ return None ;
4142+ } ;
4143+
4144+ let num_values = data_block. num_values ( ) ;
4145+
4146+ match data_block {
4147+ DataBlock :: FixedWidth ( _) => {
4148+ // Dictionary: cardinality unique values at 128 bits each
4149+ let dict_size = cardinality * ( DICT_FIXED_WIDTH_BITS_PER_VALUE / 8 ) ;
4150+ // Indices: num_values indices at 32 bits each
4151+ let indices_size = num_values * ( DICT_INDICES_BITS_PER_VALUE / 8 ) ;
4152+ Some ( dict_size + indices_size)
4153+ }
4154+ DataBlock :: VariableWidth ( var) => {
4155+ // Only 32-bit and 64-bit offsets are supported
4156+ if var. bits_per_offset != 32 && var. bits_per_offset != 64 {
4157+ return None ;
4158+ }
4159+ let bits_per_offset = var. bits_per_offset as u64 ;
4160+
4161+ let data_size = data_block. data_size ( ) ;
4162+ let avg_value_size = data_size / num_values;
4163+
4164+ // Dictionary values: actual bytes of unique strings/binary
4165+ let dict_values_size = cardinality * avg_value_size;
4166+ // Dictionary offsets: pointers into dictionary values
4167+ let dict_offsets_size = cardinality * ( bits_per_offset / 8 ) ;
4168+ // Indices: map each row to dictionary entry
4169+ let indices_size = num_values * ( bits_per_offset / 8 ) ;
4170+
4171+ Some ( dict_values_size + dict_offsets_size + indices_size)
4172+ }
4173+ _ => None ,
4174+ }
4175+ }
4176+
41204177 fn should_dictionary_encode ( data_block : & DataBlock , field : & Field ) -> bool {
4178+ // Only FixedWidth and VariableWidth blocks can be dictionary encoded for now, so bail out
4179+ // early for any other block type instead of estimating its size
4180+ if !matches ! (
4181+ data_block,
4182+ DataBlock :: FixedWidth ( _) | DataBlock :: VariableWidth ( _)
4183+ ) {
4184+ return false ;
4185+ }
4186+
41214187 // Don't dictionary encode tiny arrays
41224188 let too_small = env:: var ( "LANCE_ENCODING_DICT_TOO_SMALL" )
41234189 . ok ( )
@@ -4127,35 +4193,40 @@ impl PrimitiveStructuralEncoder {
41274193 return false ;
41284194 }
41294195
4130- // Somewhat arbitrary threshold rule. Apply dictionary encoding if the number of unique
4131- // values is less than 1/2 the total number of values.
4132- let divisor: u64 = field
4196+ // Get size ratio from metadata or env var, default to 0.8
4197+ let threshold_ratio = field
41334198 . metadata
4134- . get ( DICT_DIVISOR_META_KEY )
4135- . map ( |val| val. parse ( ) . ok ( ) )
4136- . unwrap_or_else ( || {
4137- env:: var ( "LANCE_ENCODING_DICT_DIVISOR " )
4199+ . get ( DICT_SIZE_RATIO_META_KEY )
4200+ . and_then ( |val| val. parse :: < f64 > ( ) . ok ( ) )
4201+ . or_else ( || {
4202+ env:: var ( "LANCE_ENCODING_DICT_SIZE_RATIO " )
41384203 . ok ( )
41394204 . and_then ( |val| val. parse ( ) . ok ( ) )
41404205 } )
4141- . unwrap_or ( 2 ) ;
4206+ . unwrap_or ( 0.8 ) ;
41424207
4143- // Cap on cardinality. This should be pushed into the cardinality estimation to avoid
4144- // spending too much time estimating cardinality.
4145- let max_cardinality = env:: var ( "LANCE_ENCODING_DICT_MAX_CARDINALITY" )
4146- . ok ( )
4147- . and_then ( |val| val. parse ( ) . ok ( ) )
4148- . unwrap_or ( 100000 ) ;
4208+ // Validate size ratio is in valid range
4209+ if threshold_ratio <= 0.0 || threshold_ratio > 1.0 {
4210+ panic ! (
4211+ "Invalid parameter: dict-size-ratio is {} which is not in the range (0, 1]." ,
4212+ threshold_ratio
4213+ ) ;
4214+ }
41494215
4150- let threshold = ( data_block. num_values ( ) / divisor) . min ( max_cardinality) ;
4216+ // Get raw data size
4217+ let data_size = data_block. data_size ( ) ;
41514218
4152- let cardinality = if let Some ( cardinality_array) = data_block. get_stat ( Stat :: Cardinality ) {
4153- cardinality_array. as_primitive :: < UInt64Type > ( ) . value ( 0 )
4154- } else {
4155- u64:: MAX
4219+ // Estimate dictionary-encoded size
4220+ let Some ( encoded_size) = Self :: estimate_dict_size ( data_block) else {
4221+ return false ;
41564222 } ;
41574223
4158- cardinality < threshold
4224+ let size_ratio_actual = if data_size > 0 {
4225+ encoded_size as f64 / data_size as f64
4226+ } else {
4227+ return false ;
4228+ } ;
4229+ size_ratio_actual < threshold_ratio
41594230 }
41604231
41614232 // Creates an encode task, consuming all buffered data
@@ -4400,7 +4471,14 @@ impl FieldEncoder for PrimitiveStructuralEncoder {
44004471#[ cfg( test) ]
44014472#[ allow( clippy:: single_range_in_vec_init) ]
44024473mod tests {
4474+ use super :: {
4475+ ChunkInstructions , DataBlock , DecodeMiniBlockTask , FixedPerValueDecompressor ,
4476+ FixedWidthDataBlock , FullZipCacheableState , FullZipDecodeDetails , FullZipRepIndexDetails ,
4477+ FullZipScheduler , MiniBlockRepIndex , PerValueDecompressor , PreambleAction ,
4478+ StructuralPageScheduler ,
4479+ } ;
44034480 use crate :: constants:: { STRUCTURAL_ENCODING_META_KEY , STRUCTURAL_ENCODING_MINIBLOCK } ;
4481+ use crate :: data:: BlockInfo ;
44044482 use crate :: decoder:: PageEncoding ;
44054483 use crate :: encodings:: logical:: primitive:: {
44064484 ChunkDrainInstructions , PrimitiveStructuralEncoder ,
@@ -4409,18 +4487,11 @@ mod tests {
44094487 use crate :: format:: pb21:: compressive_encoding:: Compression ;
44104488 use crate :: testing:: { check_round_trip_encoding_of_data, TestCases } ;
44114489 use crate :: version:: LanceFileVersion ;
4412- use arrow_array:: { ArrayRef , Int8Array , StringArray } ;
4490+ use arrow_array:: { ArrayRef , Int8Array , StringArray , UInt64Array } ;
44134491 use arrow_schema:: DataType ;
44144492 use std:: collections:: HashMap ;
44154493 use std:: { collections:: VecDeque , sync:: Arc } ;
44164494
4417- use super :: {
4418- ChunkInstructions , DataBlock , DecodeMiniBlockTask , FixedPerValueDecompressor ,
4419- FixedWidthDataBlock , FullZipCacheableState , FullZipDecodeDetails , FullZipRepIndexDetails ,
4420- FullZipScheduler , MiniBlockRepIndex , PerValueDecompressor , PreambleAction ,
4421- StructuralPageScheduler ,
4422- } ;
4423-
44244495 #[ test]
44254496 fn test_is_narrow ( ) {
44264497 let int8_array = Int8Array :: from ( vec ! [ 1 , 2 , 3 ] ) ;
@@ -5439,4 +5510,125 @@ mod tests {
54395510
54405511 check_round_trip_encoding_of_data ( vec ! [ string_array] , & test_cases, HashMap :: new ( ) ) . await ;
54415512 }
5513+
5514+ // Dictionary encoding decision tests
5515+ /// Helper to create FixedWidth test data block with exact cardinality stat injected
5516+ /// to ensure consistent test behavior (avoids HLL estimation error)
5517+ fn create_test_fixed_data_block ( num_values : u64 , cardinality : u64 ) -> DataBlock {
5518+ use crate :: statistics:: Stat ;
5519+
5520+ let block_info = BlockInfo :: default ( ) ;
5521+
5522+ // Manually inject exact cardinality stat for consistent test behavior
5523+ let cardinality_array = Arc :: new ( UInt64Array :: from ( vec ! [ cardinality] ) ) ;
5524+ block_info
5525+ . 0
5526+ . write ( )
5527+ . unwrap ( )
5528+ . insert ( Stat :: Cardinality , cardinality_array) ;
5529+
5530+ DataBlock :: FixedWidth ( FixedWidthDataBlock {
5531+ bits_per_value : 32 ,
5532+ data : crate :: buffer:: LanceBuffer :: from ( vec ! [ 0u8 ; ( num_values * 4 ) as usize ] ) ,
5533+ num_values,
5534+ block_info,
5535+ } )
5536+ }
5537+
5538+ /// Helper to create VariableWidth (string) test data block with exact cardinality
5539+ fn create_test_variable_width_block ( num_values : u64 , cardinality : u64 ) -> DataBlock {
5540+ use crate :: statistics:: Stat ;
5541+ use arrow_array:: StringArray ;
5542+
5543+ assert ! ( cardinality <= num_values && cardinality > 0 ) ;
5544+
5545+ let mut values = Vec :: with_capacity ( num_values as usize ) ;
5546+ for i in 0 ..num_values {
5547+ values. push ( format ! ( "value_{:016}" , i % cardinality) ) ;
5548+ }
5549+
5550+ let array = StringArray :: from ( values) ;
5551+ let block = DataBlock :: from_array ( Arc :: new ( array) as ArrayRef ) ;
5552+
5553+ // Manually inject stats for consistent test behavior
5554+ if let DataBlock :: VariableWidth ( ref var_block) = block {
5555+ let mut info = var_block. block_info . 0 . write ( ) . unwrap ( ) ;
5556+ // Cardinality: exact value to avoid HLL estimation error
5557+ info. insert (
5558+ Stat :: Cardinality ,
5559+ Arc :: new ( UInt64Array :: from ( vec ! [ cardinality] ) ) ,
5560+ ) ;
5561+ }
5562+
5563+ block
5564+ }
5565+
/// Checks the fixed-width estimate: one dictionary entry per unique value plus one index
/// per row, both sized from the dictionary-encoding constants.
#[test]
fn test_estimate_dict_size_fixed_width() {
    use crate::encodings::logical::primitive::dict::{
        DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE,
    };

    const NUM_VALUES: u64 = 1000;
    const CARDINALITY: u64 = 400;

    let block = create_test_fixed_data_block(NUM_VALUES, CARDINALITY);

    // Dictionary bytes + index bytes
    let dict_bytes = CARDINALITY * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8);
    let index_bytes = NUM_VALUES * (DICT_INDICES_BITS_PER_VALUE / 8);
    let expected_total = dict_bytes + index_bytes;

    let estimated_size = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap();

    assert_eq!(estimated_size, expected_total);
}
5583+
/// Checks the variable-width estimate: dictionary values (cardinality × truncated average
/// value size) plus 4-byte dictionary offsets plus 4-byte indices.
#[test]
fn test_estimate_dict_size_variable_width() {
    const NUM_VALUES: u64 = 1000;
    const CARDINALITY: u64 = 400;
    // The test expects 32-bit offsets, i.e. 4 bytes each
    const OFFSET_BYTES: u64 = 4;

    let block = create_test_variable_width_block(NUM_VALUES, CARDINALITY);

    // Average value size uses the same truncating integer division as the estimator
    let avg_value_size = block.data_size() / NUM_VALUES;
    let expected =
        CARDINALITY * avg_value_size + CARDINALITY * OFFSET_BYTES + NUM_VALUES * OFFSET_BYTES;

    let estimated_size = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap();

    assert_eq!(estimated_size, expected);
}
5597+
/// A low-cardinality string block (10 distinct values over 1000 rows) should be chosen
/// for dictionary encoding when the size-ratio threshold is 0.8.
#[test]
fn test_should_dictionary_encode() {
    use crate::constants::DICT_SIZE_RATIO_META_KEY;
    use lance_core::datatypes::Field as LanceField;

    // Create data where dict encoding saves space
    let block = create_test_variable_width_block(1000, 10);

    let mut metadata = HashMap::new();
    metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string());
    // The block holds strings, so describe the field as Utf8 to keep the fixture consistent
    let arrow_field =
        arrow_schema::Field::new("test", DataType::Utf8, false).with_metadata(metadata);
    let field = LanceField::try_from(&arrow_field).unwrap();

    let result = PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field);

    assert!(result, "Should use dictionary encode based on size");
}
5616+
/// A 32-bit fixed-width block should be rejected for dictionary encoding at a 0.8
/// size-ratio threshold, even with only 10 distinct values over 1000 rows.
#[test]
fn test_should_not_dictionary_encode() {
    use crate::constants::DICT_SIZE_RATIO_META_KEY;
    use lance_core::datatypes::Field as LanceField;

    let block = create_test_fixed_data_block(1000, 10);

    // Field carrying an explicit size-ratio threshold in its metadata
    let metadata = HashMap::from([(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string())]);
    let arrow_field =
        arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata);
    let field = LanceField::try_from(&arrow_field).unwrap();

    let should_encode = PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field);

    assert!(!should_encode, "Should not use dictionary encode based on size");
}
54425634}
0 commit comments