@@ -25,11 +25,22 @@ use crate::error::Result;
2525use crate :: spec:: { DataFile , ManifestEntry , ManifestFile , Operation } ;
2626use crate :: table:: Table ;
2727use crate :: transaction:: snapshot:: {
28- generate_unique_snapshot_id, DefaultManifestProcess , SnapshotProduceOperation , SnapshotProducer ,
28+ generate_unique_snapshot_id, DefaultManifestProcess , MergeManifestProcess ,
29+ SnapshotProduceOperation , SnapshotProducer ,
2930} ;
3031use crate :: transaction:: { ActionCommit , TransactionAction } ;
3132use crate :: { Error , ErrorKind } ;
3233
/// Property key holding the target size, in bytes, of a manifest file produced by merging.
pub const MANIFEST_TARGET_SIZE_BYTES: &str = "commit.manifest.target-size-bytes";
/// Default target manifest size: 8 MiB.
const MANIFEST_TARGET_SIZE_BYTES_DEFAULT: u32 = 8 << 20;
/// Property key holding the minimum number of manifests required before merging.
pub const MANIFEST_MIN_MERGE_COUNT: &str = "commit.manifest.min-count-to-merge";
/// Default minimum number of manifests to merge.
const MANIFEST_MIN_MERGE_COUNT_DEFAULT: u32 = 100;
/// Property key that switches manifest merging on or off.
pub const MANIFEST_MERGE_ENABLED: &str = "commit.manifest-merge.enabled";
/// Manifest merging is off unless explicitly enabled.
const MANIFEST_MERGE_ENABLED_DEFAULT: bool = false;
43+
3344/// FastAppendAction is a transaction action for fast append data files to the table.
3445pub struct FastAppendAction {
3546 check_duplicate : bool ,
@@ -189,6 +200,141 @@ impl SnapshotProduceOperation for FastAppendOperation {
189200 }
190201}
191202
203+ /// MergeAppendAction is a transaction action similar to fast append except that it will merge manifests
204+ /// based on the target size.
205+ pub struct MergeAppendAction {
206+ // snapshot_produce_action: SnapshotProducer<'_>,
207+ target_size_bytes : u32 ,
208+ min_count_to_merge : u32 ,
209+ merge_enabled : bool ,
210+
211+ check_duplicate : bool ,
212+ // below are properties used to create SnapshotProducer when commit
213+ commit_uuid : Option < Uuid > ,
214+ key_metadata : Option < Vec < u8 > > ,
215+ snapshot_properties : HashMap < String , String > ,
216+ added_data_files : Vec < DataFile > ,
217+ added_delete_files : Vec < DataFile > ,
218+ snapshot_id : Option < i64 > ,
219+ }
220+
221+ impl MergeAppendAction {
222+ #[ allow( clippy:: too_many_arguments) ]
223+ pub ( crate ) fn new ( ) -> Self {
224+ Self {
225+ target_size_bytes : MANIFEST_TARGET_SIZE_BYTES_DEFAULT ,
226+ min_count_to_merge : MANIFEST_MIN_MERGE_COUNT_DEFAULT ,
227+ merge_enabled : MANIFEST_MERGE_ENABLED_DEFAULT ,
228+ check_duplicate : true ,
229+ commit_uuid : None ,
230+ key_metadata : None ,
231+ snapshot_properties : HashMap :: default ( ) ,
232+ added_data_files : vec ! [ ] ,
233+ added_delete_files : vec ! [ ] ,
234+ snapshot_id : None ,
235+ }
236+ }
237+
238+ pub fn set_target_size_bytes ( mut self , v : u32 ) -> Self {
239+ self . target_size_bytes = v;
240+ self
241+ }
242+
243+ pub fn set_min_count_to_merge ( mut self , v : u32 ) -> Self {
244+ self . min_count_to_merge = v;
245+ self
246+ }
247+
248+ pub fn set_merge_enabled ( mut self , v : bool ) -> Self {
249+ self . merge_enabled = v;
250+ self
251+ }
252+
253+ pub fn set_snapshot_properties ( mut self , snapshot_properties : HashMap < String , String > ) -> Self {
254+ let target_size_bytes: u32 = snapshot_properties
255+ . get ( MANIFEST_TARGET_SIZE_BYTES )
256+ . and_then ( |s| s. parse ( ) . ok ( ) )
257+ . unwrap_or ( MANIFEST_TARGET_SIZE_BYTES_DEFAULT ) ;
258+ let min_count_to_merge: u32 = snapshot_properties
259+ . get ( MANIFEST_MIN_MERGE_COUNT )
260+ . and_then ( |s| s. parse ( ) . ok ( ) )
261+ . unwrap_or ( MANIFEST_MIN_MERGE_COUNT_DEFAULT ) ;
262+ let merge_enabled = snapshot_properties
263+ . get ( MANIFEST_MERGE_ENABLED )
264+ . and_then ( |s| s. parse ( ) . ok ( ) )
265+ . unwrap_or ( MANIFEST_MERGE_ENABLED_DEFAULT ) ;
266+
267+ self . snapshot_properties = snapshot_properties;
268+ self . target_size_bytes = target_size_bytes;
269+ self . min_count_to_merge = min_count_to_merge;
270+ self . merge_enabled = merge_enabled;
271+
272+ self
273+ }
274+
275+ /// Add data files to the snapshot.
276+ pub fn add_data_files ( mut self , data_files : impl IntoIterator < Item = DataFile > ) -> Self {
277+ self . added_data_files . extend ( data_files) ;
278+ self
279+ }
280+ }
281+
282+ #[ async_trait]
283+ impl TransactionAction for MergeAppendAction {
284+ async fn commit ( self : Arc < Self > , table : & Table ) -> Result < ActionCommit > {
285+ let snapshot_id = if let Some ( snapshot_id) = self . snapshot_id {
286+ if table
287+ . metadata ( )
288+ . snapshots ( )
289+ . any ( |s| s. snapshot_id ( ) == snapshot_id)
290+ {
291+ return Err ( Error :: new (
292+ ErrorKind :: DataInvalid ,
293+ format ! ( "Snapshot id {} already exists" , snapshot_id) ,
294+ ) ) ;
295+ }
296+ snapshot_id
297+ } else {
298+ generate_unique_snapshot_id ( table)
299+ } ;
300+
301+ let snapshot_producer = SnapshotProducer :: new (
302+ table,
303+ self . commit_uuid . unwrap_or_else ( Uuid :: now_v7) ,
304+ self . key_metadata . clone ( ) ,
305+ self . snapshot_properties . clone ( ) ,
306+ self . added_data_files . clone ( ) ,
307+ self . added_delete_files . clone ( ) ,
308+ snapshot_id,
309+ ) ;
310+
311+ // validate added files
312+ snapshot_producer. validate_added_data_files ( & self . added_data_files ) ?;
313+ snapshot_producer. validate_added_data_files ( & self . added_delete_files ) ?;
314+
315+ // Checks duplicate files
316+ if self . check_duplicate {
317+ snapshot_producer
318+ . validate_duplicate_files ( & self . added_data_files )
319+ . await ?;
320+
321+ snapshot_producer
322+ . validate_duplicate_files ( & self . added_delete_files )
323+ . await ?;
324+ }
325+
326+ if self . merge_enabled {
327+ let process =
328+ MergeManifestProcess :: new ( self . target_size_bytes , self . min_count_to_merge ) ;
329+ snapshot_producer. commit ( FastAppendOperation , process) . await
330+ } else {
331+ snapshot_producer
332+ . commit ( FastAppendOperation , DefaultManifestProcess )
333+ . await
334+ }
335+ }
336+ }
337+
192338#[ cfg( test) ]
193339mod tests {
194340 use std:: collections:: HashMap ;
0 commit comments