From a55c543a5c7f1fda0ab0c95b013e0aba28c8e6b2 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 23 Sep 2025 09:28:03 -0700 Subject: [PATCH 1/2] [Variant] VariantArray::value can return owned bytes now --- parquet-variant-compute/src/lib.rs | 2 +- parquet-variant-compute/src/shred_variant.rs | 4 +- parquet-variant-compute/src/variant_array.rs | 150 +++++++++++++----- .../src/variant_array_builder.rs | 3 +- parquet-variant-compute/src/variant_get.rs | 20 +-- parquet-variant/src/variant.rs | 6 +- 6 files changed, 134 insertions(+), 51 deletions(-) diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 496d550d95b1..d52d921247e1 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,7 +46,7 @@ mod variant_array_builder; pub mod variant_get; mod variant_to_arrow; -pub use variant_array::{ShreddingState, VariantArray, VariantType}; +pub use variant_array::{ShreddingState, VariantArray, VariantArrayValue, VariantType}; pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 138209802ab4..0408ad66183d 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -86,7 +86,9 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result { + Borrowed(Variant<'m, 'v>), + Owned { + metadata: VariantMetadata<'m>, + value_bytes: Vec, + }, +} + +impl<'m, 'v> VariantArrayValue<'m, 'v> { + pub fn borrowed(value: Variant<'m, 'v>) -> Self { + Self::Borrowed(value) + } + pub fn owned(metadata_bytes: &'m [u8], value_bytes: Vec) -> Self { + Self::Owned { + metadata: VariantMetadata::new(metadata_bytes), + value_bytes, + } + } + pub fn consume(self, visitor: impl FnOnce(Variant<'_, '_>) -> R) -> R { + match self { + VariantArrayValue::Borrowed(v) => visitor(v), + VariantArrayValue::Owned { + metadata, + value_bytes, + } => visitor(Variant::new_with_metadata(metadata, &value_bytes)), + } + } + // internal helper for when we don't want to pay the extra clone + fn as_variant_cow(&self) -> Cow<'_, Variant<'m, '_>> { + match self { + VariantArrayValue::Borrowed(v) => Cow::Borrowed(v), + VariantArrayValue::Owned { + metadata, + value_bytes, + } => Cow::Owned(Variant::new_with_metadata(metadata.clone(), value_bytes)), + } + } + pub fn as_variant(&self) -> Variant<'m, '_> { + self.as_variant_cow().into_owned() + } + pub fn metadata(&self) -> &VariantMetadata<'m> { + match self { + VariantArrayValue::Borrowed(v) => v.metadata(), + VariantArrayValue::Owned { metadata, .. } => metadata, + } + } + pub fn as_object(&self) -> Option> { + self.as_variant_cow().as_object().cloned() + } + pub fn as_list(&self) -> Option> { + self.as_variant_cow().as_list().cloned() + } + pub fn get_object_field<'s>(&'s self, field_name: &str) -> Option> { + self.as_variant_cow().get_object_field(field_name) + } + pub fn get_list_element(&self, index: usize) -> Option> { + self.as_variant_cow().get_list_element(index) + } +} + +impl<'m, 'v> From> for VariantArrayValue<'m, 'v> { + fn from(value: Variant<'m, 'v>) -> Self { + Self::borrowed(value) + } +} + +impl PartialEq for VariantArrayValue<'_, '_> { + fn eq(&self, other: &VariantArrayValue<'_, '_>) -> bool { + self.as_variant_cow().as_ref() == other.as_variant_cow().as_ref() + } +} + +impl PartialEq> for VariantArrayValue<'_, '_> { + fn eq(&self, other: &Variant<'_, '_>) -> bool { + self.as_variant_cow().as_ref() == other + } +} + +impl PartialEq> for Variant<'_, '_> { + fn eq(&self, other: &VariantArrayValue<'_, '_>) -> bool { + self == other.as_variant_cow().as_ref() + } +} + +impl std::fmt::Debug for VariantArrayValue<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_variant_cow().fmt(f) + } +} + /// An array of Parquet [`Variant`] values /// /// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying @@ -352,39 +444,24 @@ impl VariantArray { /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. - pub fn value(&self, index: usize) -> Variant<'_, '_> { - match &self.shredding_state { - ShreddingState::Unshredded { value, .. } => { - // Unshredded case - Variant::new(self.metadata.value(index), value.value(index)) + pub fn value(&self, index: usize) -> VariantArrayValue<'_, '_> { + let value = match &self.shredding_state { + // Always prefer to use the typed_value, if present + ShreddingState::Typed { typed_value, .. } + | ShreddingState::PartiallyShredded { typed_value, .. } + if typed_value.is_valid(index) => + { + return typed_value_to_variant(typed_value, index); } - ShreddingState::Typed { typed_value, .. } => { - // Typed case (formerly PerfectlyShredded) - if typed_value.is_null(index) { - Variant::Null - } else { - typed_value_to_variant(typed_value, index) - } - } - ShreddingState::PartiallyShredded { - value, typed_value, .. - } => { - // PartiallyShredded case (formerly ImperfectlyShredded) - if typed_value.is_null(index) { - Variant::new(self.metadata.value(index), value.value(index)) - } else { - typed_value_to_variant(typed_value, index) - } - } - ShreddingState::AllNull => { - // AllNull case: neither value nor typed_value fields exist - // NOTE: This handles the case where neither value nor typed_value fields exist. - // For top-level variants, this returns Variant::Null (JSON null). - // For shredded object fields, this technically should indicate SQL NULL, - // but the current API cannot distinguish these contexts. - Variant::Null + // If no typed_value, fall back to value, if present + ShreddingState::Unshredded { value, .. } + | ShreddingState::PartiallyShredded { value, .. } => { + Variant::new(self.metadata.value(index), value.value(index)) } - } + // If neither value nor typed_value fields exist, return Variant::Null. + ShreddingState::Typed { .. } | ShreddingState::AllNull => Variant::Null, + }; + value.into() } /// Return a reference to the metadata field of the [`StructArray`] @@ -796,8 +873,8 @@ impl StructArrayBuilder { } /// returns the non-null element at index as a Variant -fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> { - match typed_value.data_type() { +fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> VariantArrayValue<'_, '_> { + let value = match typed_value.data_type() { DataType::Boolean => { let boolean_array = typed_value.as_boolean(); let value = boolean_array.value(index); @@ -815,7 +892,7 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' let value = array.value(index); if *binary_len == 16 { if let Ok(uuid) = Uuid::from_slice(value) { - return Variant::from(uuid); + return Variant::from(uuid).into(); } } let value = array.value(index); @@ -877,7 +954,8 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' ); Variant::Null } - } + }; + value.into() } /// Workaround for lack of direct support for BinaryArray diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 68c1fd6b5492..cbd264384778 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -485,7 +485,7 @@ mod test { let mut builder = VariantValueArrayBuilder::new(3); // straight copy - builder.append_value(array.value(0)); + array.value(0).consume(|value| builder.append_value(value)); // filtering fields takes more work because we need to manually create an object builder let value = array.value(1); @@ -498,6 +498,7 @@ mod test { // same bytes, but now nested and duplicated inside a list let value = array.value(2); + let value = value.as_variant(); let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); let state = builder.parent_state(&mut metadata_builder); ListBuilder::new(state, false) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index ef602e84f1bf..428e3f306a68 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -144,7 +144,9 @@ fn shredded_get_path( if target.is_null(i) { builder.append_null()?; } else { - builder.append_value(target.value(i))?; + target + .value(i) + .consume(|value| builder.append_value(value))?; } } builder.finish() @@ -1596,7 +1598,7 @@ mod test { let json_str = r#"{"x": 42}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1607,7 +1609,7 @@ mod test { let json_str = r#"{"x": "foo"}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1618,7 +1620,7 @@ mod test { let json_str = r#"{"y": 10}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1637,7 +1639,7 @@ mod test { let json_str = r#"{"a": {"x": 55}, "b": 42}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1648,7 +1650,7 @@ mod test { let json_str = r#"{"a": {"x": "foo"}, "b": 42}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1667,7 +1669,7 @@ mod test { let json_str = r#"{"a": {"b": {"x": 100}}}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1678,7 +1680,7 @@ mod test { let json_str = r#"{"a": {"b": {"x": "bar"}}}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1689,7 +1691,7 @@ mod test { let json_str = r#"{"a": {"b": {"y": 200}}}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - builder.append_variant(variant_array.value(0)); + variant_array.value(0).consume(|value| builder.append_variant(value)); } else { builder.append_null(); } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 849947675b13..1db20fa776ef 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1218,7 +1218,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// let obj = variant.as_object().expect("variant should be an object"); /// assert_eq!(obj.get("name"), Some(Variant::from("John"))); /// ``` - pub fn as_object(&'m self) -> Option<&'m VariantObject<'m, 'v>> { + pub fn as_object(&self) -> Option<&VariantObject<'m, 'v>> { if let Variant::Object(obj) = self { Some(obj) } else { @@ -1280,7 +1280,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// assert_eq!(list.get(0).unwrap(), Variant::from("John")); /// assert_eq!(list.get(1).unwrap(), Variant::from("Doe")); /// ``` - pub fn as_list(&'m self) -> Option<&'m VariantList<'m, 'v>> { + pub fn as_list(&self) -> Option<&VariantList<'m, 'v>> { if let Variant::List(list) = self { Some(list) } else { @@ -1308,7 +1308,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v2 = Variant::from("Hello"); /// assert_eq!(None, v2.as_time_utc()); /// ``` - pub fn as_time_utc(&'m self) -> Option { + pub fn as_time_utc(&self) -> Option { if let Variant::Time(time) = self { Some(*time) } else { From 79b31811dc79ee003574a76d339d15b733c9af65 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 23 Sep 2025 12:40:46 -0700 Subject: [PATCH 2/2] self review + fmt --- parquet-variant-compute/src/variant_array.rs | 37 +++++++++++++++++++- parquet-variant-compute/src/variant_get.rs | 32 ++++++++++++----- 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 48f66fca2ffe..c6c27322803a 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -74,6 +74,9 @@ impl ExtensionType for VariantType { } } +/// A [`Cow`]-like representation of a [`Variant`] value returned by [`VariantArray::value`], which +/// may use owned or borrowed value bytes depending on how the underlying variant was shredded. We +/// cannot "just" use [`Cow`] because of the special lifetime management that [`Variant`] requires. pub enum VariantArrayValue<'m, 'v> { Borrowed(Variant<'m, 'v>), Owned { @@ -83,15 +86,23 @@ pub enum VariantArrayValue<'m, 'v> { } impl<'m, 'v> VariantArrayValue<'m, 'v> { + /// Creates a new instance that borrows from a normal [`Variant`] value. pub fn borrowed(value: Variant<'m, 'v>) -> Self { Self::Borrowed(value) } + + /// Creates a new instance that wraps owned bytes that can be converted to a [`Variant`] value. pub fn owned(metadata_bytes: &'m [u8], value_bytes: Vec) -> Self { Self::Owned { metadata: VariantMetadata::new(metadata_bytes), value_bytes, } } + + /// Consumes this variant value, passing the result to a `visitor` function. + /// + /// The visitor idiom is helpful because a variant value based on owned bytes cannot outlive + /// self. pub fn consume(self, visitor: impl FnOnce(Variant<'_, '_>) -> R) -> R { match self { VariantArrayValue::Borrowed(v) => visitor(v), @@ -101,6 +112,7 @@ impl<'m, 'v> VariantArrayValue<'m, 'v> { } => visitor(Variant::new_with_metadata(metadata, &value_bytes)), } } + // internal helper for when we don't want to pay the extra clone fn as_variant_cow(&self) -> Cow<'_, Variant<'m, '_>> { match self { @@ -111,24 +123,44 @@ impl<'m, 'v> VariantArrayValue<'m, 'v> { } => Cow::Owned(Variant::new_with_metadata(metadata.clone(), value_bytes)), } } + + /// Returns a [`Variant`] instance for this value. pub fn as_variant(&self) -> Variant<'m, '_> { self.as_variant_cow().into_owned() } + + /// Returns the variant metadata that backs this value. pub fn metadata(&self) -> &VariantMetadata<'m> { match self { VariantArrayValue::Borrowed(v) => v.metadata(), VariantArrayValue::Owned { metadata, .. } => metadata, } } + + /// Extracts the underlying [`VariantObject`], if this is a variant object. + /// + /// See also [`Variant::as_object`]. pub fn as_object(&self) -> Option> { self.as_variant_cow().as_object().cloned() } + + /// Extracts the underlying [`VariantList`], if this is a variant array. + /// + /// See also [`Variant::as_list`]. pub fn as_list(&self) -> Option> { self.as_variant_cow().as_list().cloned() } + + /// Extracts the value of the named variant object field, if this is a variant object. + /// + /// See also [`Variant::get_object_field`]. pub fn get_object_field<'s>(&'s self, field_name: &str) -> Option> { self.as_variant_cow().get_object_field(field_name) } + + /// Extracts the value of the variant array element at `index`, if this is a variant object. + /// + /// See also [`Variant::get_list_element`]. pub fn get_list_element(&self, index: usize) -> Option> { self.as_variant_cow().get_list_element(index) } @@ -140,6 +172,8 @@ impl<'m, 'v> From> for VariantArrayValue<'m, 'v> { } } +// By providing PartialEq for all three combinations, we avoid changing a lot of unit test code that +// relies on comparisons. impl PartialEq for VariantArrayValue<'_, '_> { fn eq(&self, other: &VariantArrayValue<'_, '_>) -> bool { self.as_variant_cow().as_ref() == other.as_variant_cow().as_ref() @@ -158,9 +192,10 @@ impl PartialEq> for Variant<'_, '_> { } } +// Make it transparent -- looks just like the underlying value it proxies impl std::fmt::Debug for VariantArrayValue<'_, '_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.as_variant_cow().fmt(f) + self.as_variant_cow().as_ref().fmt(f) } } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 428e3f306a68..47029184ba01 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -1598,7 +1598,9 @@ mod test { let json_str = r#"{"x": 42}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1609,7 +1611,9 @@ mod test { let json_str = r#"{"x": "foo"}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1620,7 +1624,9 @@ mod test { let json_str = r#"{"y": 10}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1639,7 +1645,9 @@ mod test { let json_str = r#"{"a": {"x": 55}, "b": 42}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1650,7 +1658,9 @@ mod test { let json_str = r#"{"a": {"x": "foo"}, "b": 42}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1669,7 +1679,9 @@ mod test { let json_str = r#"{"a": {"b": {"x": 100}}}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1680,7 +1692,9 @@ mod test { let json_str = r#"{"a": {"b": {"x": "bar"}}}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); } @@ -1691,7 +1705,9 @@ mod test { let json_str = r#"{"a": {"b": {"y": 200}}}"#; let string_array: ArrayRef = Arc::new(StringArray::from(vec![json_str])); if let Ok(variant_array) = json_to_variant(&string_array) { - variant_array.value(0).consume(|value| builder.append_variant(value)); + variant_array + .value(0) + .consume(|value| builder.append_variant(value)); } else { builder.append_null(); }