@@ -30,7 +30,30 @@ use crate::types::bytes::ByteArrayNativeType;
3030use  crate :: types:: { BinaryViewType ,  ByteViewType ,  StringViewType } ; 
3131use  crate :: { ArrayRef ,  GenericByteViewArray } ; 
3232
33- const  DEFAULT_BLOCK_SIZE :  u32  = 8  *  1024 ; 
33+ const  STARTING_BLOCK_SIZE :  u32  = 8  *  1024 ;  // 8KiB 
34+ const  MAX_BLOCK_SIZE :  u32  = 2  *  1024  *  1024 ;  // 2MiB 
35+ 
36+ enum  BlockSizeGrowthStrategy  { 
37+     Fixed  {  size :  u32  } , 
38+     Exponential  {  current_size :  u32  } , 
39+ } 
40+ 
41+ impl  BlockSizeGrowthStrategy  { 
42+     fn  next_size ( & mut  self )  -> u32  { 
43+         match  self  { 
44+             Self :: Fixed  {  size }  => * size, 
45+             Self :: Exponential  {  current_size }  => { 
46+                 if  * current_size < MAX_BLOCK_SIZE  { 
47+                     // we have fixed start/end block sizes, so we can't overflow 
48+                     * current_size = current_size. saturating_mul ( 2 ) ; 
49+                     * current_size
50+                 }  else  { 
51+                     MAX_BLOCK_SIZE 
52+                 } 
53+             } 
54+         } 
55+     } 
56+ } 
3457
3558/// A builder for [`GenericByteViewArray`] 
3659/// 
@@ -58,7 +81,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
5881    null_buffer_builder :  NullBufferBuilder , 
5982    completed :  Vec < Buffer > , 
6083    in_progress :  Vec < u8 > , 
61-     block_size :  u32 , 
84+     block_size :  BlockSizeGrowthStrategy , 
6285    /// Some if deduplicating strings 
6386/// map `<string hash> -> <index to the views>` 
6487string_tracker :  Option < ( HashTable < usize > ,  ahash:: RandomState ) > , 
@@ -78,15 +101,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
78101            null_buffer_builder :  NullBufferBuilder :: new ( capacity) , 
79102            completed :  vec ! [ ] , 
80103            in_progress :  vec ! [ ] , 
81-             block_size :  DEFAULT_BLOCK_SIZE , 
104+             block_size :  BlockSizeGrowthStrategy :: Exponential  { 
105+                 current_size :  STARTING_BLOCK_SIZE , 
106+             } , 
82107            string_tracker :  None , 
83108            phantom :  Default :: default ( ) , 
84109        } 
85110    } 
86111
112+     /// Set a fixed buffer size for variable length strings 
113+ /// 
114+ /// The block size is the size of the buffer used to store values greater 
115+ /// than 12 bytes. The builder allocates new buffers when the current 
116+ /// buffer is full. 
117+ /// 
118+ /// By default the builder balances buffer size and buffer count by 
119+ /// growing buffer size exponentially from 8KB up to 2MB. The 
120+ /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. 
121+ /// 
122+ /// If this method is used, any new buffers allocated are   
123+ /// exactly this size. This can be useful for advanced users 
124+ /// that want to control the memory usage and buffer count. 
125+ /// 
126+ /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications. 
127+ pub  fn  with_fixed_block_size ( self ,  block_size :  u32 )  -> Self  { 
128+         debug_assert ! ( block_size > 0 ,  "Block size must be greater than 0" ) ; 
129+         Self  { 
130+             block_size :  BlockSizeGrowthStrategy :: Fixed  {  size :  block_size } , 
131+             ..self 
132+         } 
133+     } 
134+ 
87135    /// Override the size of buffers to allocate for holding string data 
136+ /// Use `with_fixed_block_size` instead. 
137+ #[ deprecated( note = "Use `with_fixed_block_size` instead" ) ]  
88138    pub  fn  with_block_size ( self ,  block_size :  u32 )  -> Self  { 
89-         Self   {  block_size ,  .. self   } 
139+         self . with_fixed_block_size ( block_size ) 
90140    } 
91141
92142    /// Deduplicate strings while building the array 
@@ -277,7 +327,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
277327        let  required_cap = self . in_progress . len ( )  + v. len ( ) ; 
278328        if  self . in_progress . capacity ( )  < required_cap { 
279329            self . flush_in_progress ( ) ; 
280-             let  to_reserve = v. len ( ) . max ( self . block_size  as  usize ) ; 
330+             let  to_reserve = v. len ( ) . max ( self . block_size . next_size ( )  as  usize ) ; 
281331            self . in_progress . reserve ( to_reserve) ; 
282332        } ; 
283333        let  offset = self . in_progress . len ( )  as  u32 ; 
@@ -478,7 +528,7 @@ mod tests {
478528
479529        let  mut  builder = StringViewBuilder :: new ( ) 
480530            . with_deduplicate_strings ( ) 
481-             . with_block_size ( value_1. len ( )  as  u32  *  2 ) ;  // so that we will have multiple buffers 
531+             . with_fixed_block_size ( value_1. len ( )  as  u32  *  2 ) ;  // so that we will have multiple buffers 
482532
483533        let  values = vec ! [ 
484534            Some ( value_1) , 
@@ -585,4 +635,46 @@ mod tests {
585635            "Invalid argument error: No block found with index 5" 
586636        ) ; 
587637    } 
638+ 
639+     #[ test]  
640+     fn  test_string_view_with_block_size_growth ( )  { 
641+         let  mut  exp_builder = StringViewBuilder :: new ( ) ; 
642+         let  mut  fixed_builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( STARTING_BLOCK_SIZE ) ; 
643+ 
644+         let  long_string = String :: from_utf8 ( vec ! [ b'a' ;  STARTING_BLOCK_SIZE  as  usize ] ) . unwrap ( ) ; 
645+ 
646+         for  i in  0 ..9  { 
647+             // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M 
648+             for  _ in  0 ..( 2_u32 . pow ( i) )  { 
649+                 exp_builder. append_value ( & long_string) ; 
650+                 fixed_builder. append_value ( & long_string) ; 
651+             } 
652+             exp_builder. flush_in_progress ( ) ; 
653+             fixed_builder. flush_in_progress ( ) ; 
654+ 
655+             // Every step only add one buffer, but the buffer size is much larger 
656+             assert_eq ! ( exp_builder. completed. len( ) ,  i as  usize  + 1 ) ; 
657+             assert_eq ! ( 
658+                 exp_builder. completed[ i as  usize ] . len( ) , 
659+                 STARTING_BLOCK_SIZE  as  usize  *  2_usize . pow( i) 
660+             ) ; 
661+ 
662+             // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1 
663+             assert_eq ! ( fixed_builder. completed. len( ) ,  2_usize . pow( i + 1 )  - 1 ) ; 
664+ 
665+             // Every buffer is fixed size 
666+             assert ! ( fixed_builder
667+                 . completed
668+                 . iter( ) 
669+                 . all( |b| b. len( )  == STARTING_BLOCK_SIZE  as  usize ) ) ; 
670+         } 
671+ 
672+         // Add one more value, and the buffer stop growing. 
673+         exp_builder. append_value ( & long_string) ; 
674+         exp_builder. flush_in_progress ( ) ; 
675+         assert_eq ! ( 
676+             exp_builder. completed. last( ) . unwrap( ) . capacity( ) , 
677+             MAX_BLOCK_SIZE  as  usize 
678+         ) ; 
679+     } 
588680} 
0 commit comments