Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions crates/iceberg/src/spec/manifest/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,4 +432,154 @@ mod tests {

assert_eq!(actual_data_file[0].content, DataContentType::Data)
}

/// Builds a `ManifestEntryV1` by hand, runs it through its three-argument
/// `try_into`, and checks the sequence numbers, snapshot id and data-file
/// fields of the resulting entry.
#[test]
fn test_manifest_entry_v1_to_v2_projection() {
    use crate::spec::manifest::_serde::{DataFileSerde, ManifestEntryV1};
    use crate::spec::{Literal, RawLiteral, Struct, StructType};

    let file_path = "test/path.parquet";

    // Empty partition tuple, typed against an empty struct type.
    let empty_partition_type = Type::Struct(StructType::new(vec![]));
    let empty_partition =
        RawLiteral::try_from(Literal::Struct(Struct::empty()), &empty_partition_type).unwrap();

    // Serde-level data file; the same struct is used for both V1 and V2.
    let data_file = DataFileSerde {
        content: 0,
        file_path: file_path.to_string(),
        file_format: "PARQUET".to_string(),
        partition: empty_partition,
        record_count: 100,
        file_size_in_bytes: 1024,
        // Present in V1 manifests.
        block_size_in_bytes: Some(0),
        column_sizes: None,
        value_counts: None,
        null_value_counts: None,
        nan_value_counts: None,
        lower_bounds: None,
        upper_bounds: None,
        key_metadata: None,
        split_offsets: None,
        // Expected to come out as an empty vec after conversion.
        equality_ids: None,
        sort_order_id: None,
        first_row_id: None,
        referenced_data_file: None,
        content_offset: None,
        content_size_in_bytes: None,
    };

    // V1 entry: it carries no V2 sequence-number fields.
    let legacy_entry = ManifestEntryV1 {
        status: 1, // Added
        snapshot_id: 12345,
        data_file,
    };

    // Inputs for the explicit conversion: spec id 0, empty partition type,
    // and the schema supplied by the surrounding test module.
    let partition_type = StructType::new(vec![]);
    let table_schema = schema();
    let projected = legacy_entry
        .try_into(0, &partition_type, &table_schema)
        .unwrap();

    // Sequence-number fields that V1 lacks must be filled in.
    assert_eq!(
        projected.sequence_number,
        Some(0),
        "ManifestEntryV1::try_into() should set sequence_number to 0"
    );
    assert_eq!(
        projected.file_sequence_number,
        Some(0),
        "ManifestEntryV1::try_into() should set file_sequence_number to 0"
    );
    assert_eq!(
        projected.snapshot_id,
        Some(12345),
        "snapshot_id should be preserved during conversion"
    );

    let projected_file = &projected.data_file;

    // Defaults applied while converting the nested data file.
    assert_eq!(
        projected_file.content,
        DataContentType::Data,
        "DataFileSerde should convert content 0 to DataContentType::Data"
    );
    assert_eq!(
        projected_file.equality_ids,
        Vec::<i32>::new(),
        "DataFileSerde should convert None equality_ids to empty vec"
    );

    // Plain values must pass through unchanged.
    assert_eq!(projected_file.file_path, file_path);
    assert_eq!(projected_file.record_count, 100);
    assert_eq!(projected_file.file_size_in_bytes, 1024);
}

/// Converts a `DataFileSerde` whose V2-only fields are left at their
/// zero/`None` values and checks the defaults and pass-through fields of the
/// resulting data file.
#[test]
fn test_data_file_serde_v1_field_defaults() {
    use crate::spec::manifest::_serde::DataFileSerde;
    use crate::spec::{Literal, RawLiteral, Struct, StructType};

    let file_path = "test/data.parquet";

    // Empty partition tuple, typed against an empty struct type.
    let empty_partition_type = Type::Struct(StructType::new(vec![]));
    let empty_partition =
        RawLiteral::try_from(Literal::Struct(Struct::empty()), &empty_partition_type).unwrap();

    // Mimics what deserializing a V1 record yields: fields that only exist in
    // V2 fall back to their #[serde(default)] values.
    let serde_file = DataFileSerde {
        // Absent in V1; 0 via #[serde(default)].
        content: 0,
        file_path: file_path.to_string(),
        file_format: "PARQUET".to_string(),
        partition: empty_partition,
        record_count: 500,
        file_size_in_bytes: 2048,
        // Written by V1, skipped by V2.
        block_size_in_bytes: Some(1024),
        column_sizes: None,
        value_counts: None,
        null_value_counts: None,
        nan_value_counts: None,
        lower_bounds: None,
        upper_bounds: None,
        key_metadata: None,
        split_offsets: None,
        // Absent in V1; None via #[serde(default)].
        equality_ids: None,
        sort_order_id: None,
        first_row_id: None,
        referenced_data_file: None,
        content_offset: None,
        content_size_in_bytes: None,
    };

    // Inputs for the conversion: spec id 0, empty partition type, and the
    // schema supplied by the surrounding test module.
    let partition_type = StructType::new(vec![]);
    let table_schema = schema();
    let converted = serde_file
        .try_into(0, &partition_type, &table_schema)
        .unwrap();

    // Defaults for the fields V1 does not carry.
    assert_eq!(
        converted.content,
        DataContentType::Data,
        "content 0 should convert to DataContentType::Data"
    );
    assert_eq!(
        converted.equality_ids,
        Vec::<i32>::new(),
        "None equality_ids should convert to empty vec via unwrap_or_default()"
    );

    // Remaining fields after conversion.
    assert_eq!(converted.file_path, file_path);
    assert_eq!(converted.file_format, DataFileFormat::Parquet);
    assert_eq!(converted.record_count, 500);
    assert_eq!(converted.file_size_in_bytes, 2048);
    assert_eq!(converted.partition_spec_id, 0);
}
}
17 changes: 16 additions & 1 deletion crates/iceberg/src/spec/manifest/data_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,9 +333,10 @@ pub fn read_data_files_from_avro<R: Read>(

/// Type of content stored by the data file: data, equality deletes, or
/// position deletes (all v1 files are data files)
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Default)]
pub enum DataContentType {
/// value: 0
#[default]
Data = 0,
/// value: 1
PositionDeletes = 1,
Expand Down Expand Up @@ -399,3 +400,17 @@ impl std::fmt::Display for DataFileFormat {
}
}
}

#[cfg(test)]
mod test {
    use crate::spec::DataContentType;

    /// `Default` must select the `Data` variant.
    #[test]
    fn test_data_content_type_default() {
        let default_kind = DataContentType::default();
        assert_eq!(default_kind, DataContentType::Data);
    }

    /// The default variant must cast to the integer 0.
    #[test]
    fn test_data_content_type_default_value() {
        let discriminant = DataContentType::default() as i32;
        assert_eq!(discriminant, 0);
    }
}
66 changes: 65 additions & 1 deletion crates/iceberg/src/spec/manifest_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,9 +600,10 @@ impl ManifestFile {
}

/// The type of files tracked by the manifest, either data or delete files; Data(0) for all v1 manifests
#[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
#[derive(Debug, PartialEq, Clone, Copy, Eq, Hash, Default)]
pub enum ManifestContentType {
/// The manifest content is data.
#[default]
Data = 0,
/// The manifest content is deletes.
Deletes = 1,
Expand Down Expand Up @@ -1357,4 +1358,67 @@ mod test {
};
fields
}

/// `Default` must select the `Data` variant.
#[test]
fn test_manifest_content_type_default() {
    let default_kind = ManifestContentType::default();
    assert_eq!(default_kind, ManifestContentType::Data);
}

/// The default variant must cast to the integer 0.
#[test]
fn test_manifest_content_type_default_value() {
    let discriminant = ManifestContentType::default() as i32;
    assert_eq!(discriminant, 0);
}

/// Converts a hand-built `ManifestFileV1` into a `ManifestFile` and checks
/// both the defaulted V2 fields and the values carried over from V1.
#[test]
fn test_manifest_file_v1_to_v2_projection() {
    use crate::spec::manifest_list::_serde::ManifestFileV1;

    let manifest_path = "/test/manifest.avro";

    // V1 manifest-list record: it has no content or sequence-number fields.
    let legacy = ManifestFileV1 {
        manifest_path: manifest_path.to_string(),
        manifest_length: 5806,
        partition_spec_id: 0,
        added_snapshot_id: 1646658105718557341,
        added_data_files_count: Some(3),
        existing_data_files_count: Some(0),
        deleted_data_files_count: Some(0),
        added_rows_count: Some(3),
        existing_rows_count: Some(0),
        deleted_rows_count: Some(0),
        partitions: None,
        key_metadata: None,
    };

    // The conversion is expected to fill in the fields V1 does not have.
    let projected: ManifestFile = legacy.try_into().unwrap();

    // Defaults for the V2-only fields.
    assert_eq!(
        projected.content,
        ManifestContentType::Data,
        "V1 manifest content should default to Data (0)"
    );
    assert_eq!(
        projected.sequence_number, 0,
        "V1 manifest sequence_number should default to 0"
    );
    assert_eq!(
        projected.min_sequence_number, 0,
        "V1 manifest min_sequence_number should default to 0"
    );

    // Identity of the manifest.
    assert_eq!(projected.manifest_path, manifest_path);
    assert_eq!(projected.manifest_length, 5806);
    assert_eq!(projected.partition_spec_id, 0);
    assert_eq!(projected.added_snapshot_id, 1646658105718557341);

    // File counts (V1 `*_data_files_count` maps onto `*_files_count`).
    assert_eq!(projected.added_files_count, Some(3));
    assert_eq!(projected.existing_files_count, Some(0));
    assert_eq!(projected.deleted_files_count, Some(0));

    // Row counts.
    assert_eq!(projected.added_rows_count, Some(3));
    assert_eq!(projected.existing_rows_count, Some(0));
    assert_eq!(projected.deleted_rows_count, Some(0));

    // Optional fields that were never set.
    assert_eq!(projected.partitions, None);
    assert_eq!(projected.key_metadata, None);
}
}
90 changes: 90 additions & 0 deletions crates/iceberg/src/spec/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -435,4 +435,94 @@ mod tests {
);
assert_eq!("s3://b/wh/.../s1.avro".to_string(), *result.manifest_list());
}

/// Converts a fully populated `SnapshotV1` into a `Snapshot` and checks the
/// defaulted sequence number along with every value carried over from V1.
#[test]
fn test_snapshot_v1_to_v2_projection() {
    use crate::spec::snapshot::_serde::SnapshotV1;

    let manifest_list_path = "s3://bucket/manifest-list.avro";

    let append_summary = Summary {
        operation: Operation::Append,
        additional_properties: HashMap::from([
            ("added-files".to_string(), "5".to_string()),
            ("added-records".to_string(), "100".to_string()),
        ]),
    };

    // V1 snapshot: there is no sequence-number field to set.
    let legacy = SnapshotV1 {
        snapshot_id: 1234567890,
        parent_snapshot_id: Some(987654321),
        timestamp_ms: 1515100955770,
        manifest_list: Some(manifest_list_path.to_string()),
        // Only one of manifest_list / manifests is populated here.
        manifests: None,
        summary: Some(append_summary),
        schema_id: Some(1),
    };

    // The conversion is expected to fill in the fields V1 does not have.
    let projected: Snapshot = legacy.try_into().unwrap();

    // Default for the V2-only field.
    assert_eq!(
        projected.sequence_number(),
        0,
        "V1 snapshot sequence_number should default to 0"
    );

    // Values that must survive the conversion untouched.
    assert_eq!(projected.snapshot_id(), 1234567890);
    assert_eq!(projected.parent_snapshot_id(), Some(987654321));
    assert_eq!(projected.timestamp_ms(), 1515100955770);
    assert_eq!(projected.manifest_list(), manifest_list_path);
    assert_eq!(projected.schema_id(), Some(1));

    let summary = projected.summary();
    assert_eq!(summary.operation, Operation::Append);
    assert_eq!(
        summary
            .additional_properties
            .get("added-files")
            .map(String::as_str),
        Some("5")
    );
}

/// Converts a `SnapshotV1` that has no summary and checks that the resulting
/// `Snapshot` reports an `Append` operation with no additional properties.
#[test]
fn test_snapshot_v1_to_v2_with_missing_summary() {
    use crate::spec::snapshot::_serde::SnapshotV1;

    // V1 snapshot with every optional field except manifest_list left empty.
    let legacy = SnapshotV1 {
        snapshot_id: 1111111111,
        parent_snapshot_id: None,
        timestamp_ms: 1515100955770,
        manifest_list: Some("s3://bucket/manifest-list.avro".to_string()),
        manifests: None,
        // The summary is optional in V1.
        summary: None,
        schema_id: None,
    };

    // The conversion is expected to supply a default summary.
    let projected: Snapshot = legacy.try_into().unwrap();

    // Defaults filled in by the conversion.
    assert_eq!(
        projected.sequence_number(),
        0,
        "V1 snapshot sequence_number should default to 0"
    );

    let summary = projected.summary();
    assert_eq!(
        summary.operation,
        Operation::Append,
        "Missing V1 summary should default to Append operation"
    );
    assert!(
        summary.additional_properties.is_empty(),
        "Default summary should have empty additional_properties"
    );

    // Values that must survive the conversion untouched.
    assert_eq!(projected.snapshot_id(), 1111111111);
    assert_eq!(projected.parent_snapshot_id(), None);
    assert_eq!(projected.schema_id(), None);
}
}
Loading