diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs index 3339b38407..900d6c6014 100644 --- a/bindings/python/src/data_file.rs +++ b/bindings/python/src/data_file.rs @@ -148,7 +148,7 @@ impl PyDataFile { } #[getter] - fn equality_ids(&self) -> &[i32] { + fn equality_ids(&self) -> Option> { self.inner.equality_ids() } diff --git a/bindings/python/tests/test_manifest.py b/bindings/python/tests/test_manifest.py index 0e838c3954..701eac2509 100644 --- a/bindings/python/tests/test_manifest.py +++ b/bindings/python/tests/test_manifest.py @@ -138,5 +138,5 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: } assert data_file.key_metadata is None assert data_file.split_offsets == [4] - assert data_file.equality_ids == [] + assert data_file.equality_ids is None assert data_file.sort_order_id == 0 diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index c6a8943d78..9cf605680e 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -233,7 +233,7 @@ impl CachingDeleteFileLoader { ) .await?, sender, - equality_ids: HashSet::from_iter(task.equality_ids.clone()), + equality_ids: HashSet::from_iter(task.equality_ids.clone().unwrap()), }) } diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index 0dd53a34fa..b853baa993 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -311,21 +311,21 @@ pub(crate) mod tests { file_path: format!("{}/pos-del-1.parquet", table_location.to_str().unwrap()), file_type: DataContentType::PositionDeletes, partition_spec_id: 0, - equality_ids: vec![], + equality_ids: None, }; let pos_del_2 = FileScanTaskDeleteFile { file_path: format!("{}/pos-del-2.parquet", table_location.to_str().unwrap()), file_type: DataContentType::PositionDeletes, partition_spec_id: 0, - equality_ids: vec![], + equality_ids: None, }; let pos_del_3 = FileScanTaskDeleteFile { file_path: format!("{}/pos-del-3.parquet", table_location.to_str().unwrap()), file_type: DataContentType::PositionDeletes, partition_spec_id: 0, - equality_ids: vec![], + equality_ids: None, }; let file_scan_tasks = vec![ diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs b/crates/iceberg/src/expr/visitors/expression_evaluator.rs index 4db1ad7d93..3675ce355f 100644 --- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs @@ -347,7 +347,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -375,7 +375,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index a00376e1ac..2b65cf12aa 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -1996,7 +1996,7 @@ mod test { upper_bounds: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2022,7 +2022,7 @@ mod test { upper_bounds: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2084,7 +2084,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2115,7 +2115,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2147,7 +2147,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -2179,7 +2179,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs index f74ce3a6c3..e17c44c658 100644 --- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs @@ -579,7 +579,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -605,7 +605,7 @@ mod test { upper_bounds: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -631,7 +631,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -658,7 +658,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs index 7b111e4f04..32fe3ae309 100644 --- a/crates/iceberg/src/scan/task.rs +++ b/crates/iceberg/src/scan/task.rs @@ -112,6 +112,6 @@ pub struct FileScanTaskDeleteFile { /// partition id pub partition_spec_id: i32, - /// equality ids for equality deletes (empty for positional deletes) - pub equality_ids: Vec, + /// equality ids for equality deletes (null for anything other than equality-deletes) + pub equality_ids: Option>, } diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs index 462128cf2b..7738af46d4 100644 --- a/crates/iceberg/src/spec/manifest/_serde.rs +++ b/crates/iceberg/src/spec/manifest/_serde.rs @@ -116,7 +116,6 @@ pub(super) struct DataFileSerde { upper_bounds: Option>, key_metadata: Option, split_offsets: Option>, - #[serde(default)] equality_ids: Option>, sort_order_id: Option, first_row_id: Option, @@ -155,7 +154,7 @@ impl DataFileSerde { upper_bounds: Some(to_bytes_entry(value.upper_bounds)?), key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from), split_offsets: Some(value.split_offsets), - equality_ids: Some(value.equality_ids), + equality_ids: value.equality_ids, sort_order_id: value.sort_order_id, first_row_id: value.first_row_id, referenced_data_file: value.referenced_data_file, @@ -224,7 +223,7 @@ impl DataFileSerde { .unwrap_or_default(), key_metadata: self.key_metadata.map(|v| v.to_vec()), split_offsets: self.split_offsets.unwrap_or_default(), - equality_ids: self.equality_ids.unwrap_or_default(), + equality_ids: self.equality_ids, sort_order_id: self.sort_order_id, partition_spec_id, first_row_id: self.first_row_id, @@ -382,7 +381,7 @@ mod tests { upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -517,9 +516,8 @@ mod tests { "DataFileSerde should convert content 0 to DataContentType::Data" ); assert_eq!( - v2_entry.data_file.equality_ids, - Vec::::new(), - "DataFileSerde should convert None equality_ids to empty vec" + v2_entry.data_file.equality_ids, None, + "DataFileSerde should preserve None equality_ids as None" ); // Verify other fields are preserved during conversion @@ -581,9 +579,8 @@ mod tests { "content 0 should convert to DataContentType::Data" ); assert_eq!( - data_file.equality_ids, - Vec::::new(), - "None equality_ids should convert to empty vec via unwrap_or_default()" + data_file.equality_ids, None, + "None equality_ids should remain as None" ); // Verify other fields are handled correctly during conversion diff --git a/crates/iceberg/src/spec/manifest/data_file.rs b/crates/iceberg/src/spec/manifest/data_file.rs index d191ba2eab..931f9441e9 100644 --- a/crates/iceberg/src/spec/manifest/data_file.rs +++ b/crates/iceberg/src/spec/manifest/data_file.rs @@ -135,7 +135,7 @@ pub struct DataFile { /// otherwise. Fields with ids listed in this column must be present /// in the delete file #[builder(default)] - pub(crate) equality_ids: Vec, + pub(crate) equality_ids: Option>, /// field id: 140 /// /// ID representing sort order for this file. @@ -249,8 +249,8 @@ impl DataFile { /// Get the equality ids of the data file. /// Field ids used to determine row equality in equality delete files. /// null when content is not EqualityDeletes. - pub fn equality_ids(&self) -> &[i32] { - &self.equality_ids + pub fn equality_ids(&self) -> Option> { + self.equality_ids.clone() } /// Get the first row id in the data file. pub fn first_row_id(&self) -> Option { diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index da03977324..a1a5612c36 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -256,7 +256,7 @@ mod tests { snapshot_id: None, sequence_number: None, file_sequence_number: None, - data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:vec![4],equality_ids:Vec::new(),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None } + data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:vec![4],equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None } } ]; @@ -435,7 +435,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: Some(Vec::new()), sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -532,7 +532,7 @@ mod tests { upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -640,7 +640,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -749,7 +749,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -840,7 +840,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -922,7 +922,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -957,7 +957,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -992,7 +992,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -1027,7 +1027,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -1182,7 +1182,7 @@ mod tests { "upper_bounds": [], "key_metadata": null, "split_offsets": [], - "equality_ids": [], + "equality_ids": null, "sort_order_id": null, "first_row_id": null, "referenced_data_file": null, @@ -1213,7 +1213,7 @@ mod tests { "upper_bounds": [], "key_metadata": null, "split_offsets": [], - "equality_ids": [], + "equality_ids": null, "sort_order_id": null, "first_row_id": null, "referenced_data_file": null, diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 39945a513c..673f8b5d2a 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -545,7 +545,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -574,7 +574,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -603,7 +603,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), split_offsets: vec![4], - equality_ids: Vec::new(), + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/spec/snapshot_summary.rs b/crates/iceberg/src/spec/snapshot_summary.rs index e374e567ff..1b07ce3fa0 100644 --- a/crates/iceberg/src/spec/snapshot_summary.rs +++ b/crates/iceberg/src/spec/snapshot_summary.rs @@ -768,7 +768,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -800,7 +800,7 @@ mod tests { ]), key_metadata: None, split_offsets: vec![4], - equality_ids: vec![], + equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, first_row_id: None, @@ -910,7 +910,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -938,7 +938,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, @@ -992,7 +992,7 @@ mod tests { upper_bounds: HashMap::new(), key_metadata: None, split_offsets: vec![], - equality_ids: vec![], + equality_ids: None, sort_order_id: None, partition_spec_id: 0, first_row_id: None, diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs index f710544d9e..765ff1cacd 100644 --- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs +++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs @@ -156,7 +156,7 @@ impl IcebergWriter for EqualityDeleteFileWriter { .into_iter() .map(|mut res| { res.content(crate::spec::DataContentType::EqualityDeletes); - res.equality_ids(self.equality_ids.iter().copied().collect_vec()); + res.equality_ids(Some(self.equality_ids.iter().copied().collect_vec())); res.partition(self.partition_value.clone()); res.partition_spec_id(self.partition_spec_id); res.build().expect("msg")