From c9bbc44b8e66635cbc63e059e9b8877694cd7e0d Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Thu, 2 Oct 2025 06:17:04 -0700 Subject: [PATCH 01/17] feat(parquet): arrow sbbf --- parquet/src/arrow/bloom_filter.rs | 727 ++++++++++++++++++++++++++++++ parquet/src/arrow/mod.rs | 1 + 2 files changed, 728 insertions(+) create mode 100644 parquet/src/arrow/bloom_filter.rs diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs new file mode 100644 index 000000000000..7e2f041e66f5 --- /dev/null +++ b/parquet/src/arrow/bloom_filter.rs @@ -0,0 +1,727 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow-aware bloom filter +//! +//! This module provides [`ArrowSbbf`], a wrapper around [`Sbbf`] that handles +//! Arrow-to-Parquet type coercion when checking bloom filters. +//! +//! # Overview +//! +//! Arrow supports types like `Int8` and `Int16` that have no corresponding Parquet type. +//! Data of these types must be coerced to the appropriate Parquet type when writing. +//! +//! For example, Arrow small integer types are coerced writing to Parquet: +//! - `Int8` → `INT32` (i8 → i32) +//! - `Int16` → `INT32` (i16 → i32) +//! - `UInt8` → `INT32` (u8 → u32 → i32) +//! - `UInt16` → `INT32` (u16 → u32 → i32) +//! +//! Decimal types with small precision are also coerced: +//! - `Decimal32/64/128/256(precision 1-9)` → `INT32` (truncated via `as i32`) +//! - `Decimal64/128/256(precision 10-18)` → `INT64` (truncated via `as i64`) +//! +//! In these situations, bloom filters store hashes of the *physical* Parquet values, but Arrow users +//! will often need to check using the *logical* Arrow values. +//! +//! [`ArrowSbbf`] wraps an [`Sbbf`] and an Arrow [`DataType`], automatically coercing +//! values to their Parquet representation before checking the bloom filter. +//! +//! # Example +//! +//! ```ignore +//! use arrow_schema::DataType; +//! use parquet::arrow::bloom_filter::ArrowSbbf; +//! +//! // Get bloom filter from row group reader +//! let sbbf = row_group_reader.get_column_bloom_filter(0)?; +//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Int8); +//! +//! // Check with i8 value - automatically coerced to i32 +//! let value: i8 = 42; +//! if arrow_sbbf.check(&value) { +//! println!("Value might be present"); +//! } +//! ``` +//! +//! # Known Limitations +//! +//! **Date64 with coerce_types=true**: When [`WriterProperties::set_coerce_types`] is enabled, +//! `Date64` values (i64 milliseconds) are coerced to `Date32` (i32 days). Currently, +//! [`ArrowSbbf`] cannot detect this case and will produce false negatives when checking +//! `Date64` values against bloom filters created with `coerce_types=true`. +//! +//! Workaround: Manually convert milliseconds to days before checking: +//! ```ignore +//! let ms = 864_000_000_i64; // 10 days in milliseconds +//! let days = (ms / 86_400_000) as i32; +//! arrow_sbbf.check(&days); // Check with i32 days instead +//! ``` +//! +//! [`WriterProperties::set_coerce_types`]: crate::file::properties::WriterPropertiesBuilder::set_coerce_types + +use crate::bloom_filter::Sbbf; +use crate::data_type::AsBytes; +use arrow_schema::DataType; + +/// Wraps an [`Sbbf`] and provides automatic type coercion based on Arrow schema. +/// Ensures that checking bloom filters works correctly for Arrow types that require +/// coercion to Parquet physical types (e.g., Int8 → INT32). +#[derive(Debug, Clone)] +pub struct ArrowSbbf<'a> { + sbbf: &'a Sbbf, + arrow_type: &'a DataType, +} + +impl<'a> ArrowSbbf<'a> { + /// Create a new Arrow-aware bloom filter wrapper + /// * `sbbf` - Parquet bloom filter for the column + /// * `arrow_type` - Arrow data type for the column + pub fn new(sbbf: &'a Sbbf, arrow_type: &'a DataType) -> Self { + Self { sbbf, arrow_type } + } + + /// Check if a value might be present in the bloom filter + /// Automatically handles type coercion based on the Arrow data type. + /// + /// Returns `true` if the value might be present (may have false positives), + /// or `false` if the value is definitely not present. + pub fn check(&self, value: &T) -> bool { + match self.arrow_type { + DataType::Int8 => { + // Arrow Int8 -> Parquet INT32 + let bytes = value.as_bytes(); + if bytes.len() == 1 { + let i8_val = i8::from_le_bytes([bytes[0]]); + let i32_val = i8_val as i32; + self.sbbf.check(&i32_val) + } else { + // Unexpected size, fall back to direct check + self.sbbf.check(value) + } + } + DataType::Int16 => { + // Arrow Int16 -> Parquet INT32 + let bytes = value.as_bytes(); + if bytes.len() == 2 { + let i16_val = i16::from_le_bytes([bytes[0], bytes[1]]); + let i32_val = i16_val as i32; + self.sbbf.check(&i32_val) + } else { + // Unexpected size, fall back to direct check + self.sbbf.check(value) + } + } + DataType::UInt8 => { + // Arrow UInt8 -> Parquet INT32 + let bytes = value.as_bytes(); + if bytes.len() == 1 { + let u8_val = bytes[0]; + let u32_val = u8_val as u32; + let i32_val = u32_val as i32; + self.sbbf.check(&i32_val) + } else { + // Unexpected size, fall back to direct check + self.sbbf.check(value) + } + } + DataType::UInt16 => { + // Arrow UInt16 -> Parquet INT32 + let bytes = value.as_bytes(); + if bytes.len() == 2 { + let u16_val = u16::from_le_bytes([bytes[0], bytes[1]]); + let u32_val = u16_val as u32; + let i32_val = u32_val as i32; + self.sbbf.check(&i32_val) + } else { + // Unexpected size, fall back to direct check + self.sbbf.check(value) + } + } + DataType::Decimal32(precision, _) + | DataType::Decimal64(precision, _) + | DataType::Decimal128(precision, _) + | DataType::Decimal256(precision, _) + if *precision >= 1 && *precision <= 9 => + { + // Decimal with precision 1-9 -> Parquet INT32 + // Writer truncates via `as i32` + let bytes = value.as_bytes(); + match bytes.len() { + 4 => { + // Decimal32: i32 value, directly reinterpret + let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); + self.sbbf.check(&i32_val) + } + 8 => { + // Decimal64: i64 value, truncate to i32 + let i64_val = i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], + ]); + let i32_val = i64_val as i32; + self.sbbf.check(&i32_val) + } + 16 => { + // Decimal128: i128 value, truncate to i32 + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], + bytes[13], bytes[14], bytes[15], + ]); + let i32_val = i128_val as i32; + self.sbbf.check(&i32_val) + } + 32 => { + // Decimal256: i256 stored as 32 bytes, truncate to i32 + // Read first 16 bytes as i128, then truncate + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], + bytes[13], bytes[14], bytes[15], + ]); + let i32_val = i128_val as i32; + self.sbbf.check(&i32_val) + } + _ => self.sbbf.check(value), + } + } + DataType::Decimal64(precision, _) + | DataType::Decimal128(precision, _) + | DataType::Decimal256(precision, _) + if *precision >= 10 && *precision <= 18 => + { + // Decimal with precision 10-18 -> Parquet INT64 + // Writer truncates via `as i64` + let bytes = value.as_bytes(); + match bytes.len() { + 8 => { + // Decimal64: i64 value, directly use + let i64_val = i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], + ]); + self.sbbf.check(&i64_val) + } + 16 => { + // Decimal128: i128 value, truncate to i64 + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], + bytes[13], bytes[14], bytes[15], + ]); + let i64_val = i128_val as i64; + self.sbbf.check(&i64_val) + } + 32 => { + // Decimal256: i256 stored as 32 bytes, truncate to i64 + // Read first 16 bytes as i128, then truncate + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], + bytes[13], bytes[14], bytes[15], + ]); + let i64_val = i128_val as i64; + self.sbbf.check(&i64_val) + } + _ => self.sbbf.check(value), + } + } + // No coercion needed + _ => self.sbbf.check(value), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::ArrowWriter; + use crate::file::properties::{ReaderProperties, WriterProperties}; + use crate::file::reader::{FileReader, SerializedFileReader}; + use crate::file::serialized_reader::ReadOptionsBuilder; + use arrow_array::{ + ArrayRef, Date64Array, Decimal128Array, Decimal32Array, Float16Array, Int16Array, + Int8Array, RecordBatch, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + }; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + use tempfile::tempfile; + + /// Helper function to build a bloom filter for testing + /// + /// Writes the given array to a Parquet file with bloom filters enabled, + /// then reads it back and returns the bloom filter for the first column. + fn build_sbbf(array: ArrayRef, field: Field) -> Sbbf { + let schema = Arc::new(Schema::new(vec![field.clone()])); + let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap(); + + // Write with bloom filter enabled + let mut file = tempfile().unwrap(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + + let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read back with bloom filters enabled + let options = ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + // Get and return the bloom filter + let row_group_reader = reader.get_row_group(0).unwrap(); + row_group_reader + .get_column_bloom_filter(0) + .expect("Bloom filter should exist") + .clone() + } + + #[test] + fn test_int8_coercion() { + // Int8 -> Parquet INT32 (sign extension changes bytes) + let test_value = 3_i8; + let array = Arc::new(Int8Array::from(vec![1_i8, 2, test_value, 4, 5])); + let field = Field::new("col", DataType::Int8, false); + let sbbf = build_sbbf(array, field.clone()); + + // Direct check should fail (bloom filter has i32) + assert!(!sbbf.check(&test_value), "Direct check should fail"); + + // ArrowSbbf check should succeed (coerces i8 to i32) + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_int16_coercion() { + // Int16 -> Parquet INT32 (sign extension changes bytes) + let test_value = 300_i16; + let array = Arc::new(Int16Array::from(vec![100_i16, 200, test_value, 400, 500])); + let field = Field::new("col", DataType::Int16, false); + let sbbf = build_sbbf(array, field.clone()); + + // Direct check should fail (bloom filter has i32) + assert!(!sbbf.check(&test_value), "Direct check should fail"); + + // ArrowSbbf check should succeed (coerces i16 to i32) + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_uint8_coercion() { + // UInt8 -> Parquet INT32 (zero extension changes bytes) + let test_value = 30_u8; + let array = Arc::new(UInt8Array::from(vec![10_u8, 20, test_value, 40, 50])); + let field = Field::new("col", DataType::UInt8, false); + let sbbf = build_sbbf(array, field.clone()); + + // Direct check should fail (bloom filter has i32) + assert!(!sbbf.check(&test_value), "Direct check should fail"); + + // ArrowSbbf check should succeed (coerces u8 to i32) + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_uint16_coercion() { + // UInt16 -> Parquet INT32 (zero extension changes bytes) + let test_value = 3000_u16; + let array = Arc::new(UInt16Array::from(vec![ + 1000_u16, 2000, test_value, 4000, 5000, + ])); + let field = Field::new("col", DataType::UInt16, false); + let sbbf = build_sbbf(array, field.clone()); + + // Direct check should fail (bloom filter has i32) + assert!(!sbbf.check(&test_value), "Direct check should fail"); + + // ArrowSbbf check should succeed (coerces u16 to i32) + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_uint32_no_coercion() { + // UInt32 -> Parquet INT32 (reinterpret cast preserves bit pattern) + let test_value = 3_000_000_000_u32; // > i32::MAX + let array = Arc::new(UInt32Array::from(vec![ + 100_u32, + test_value, + 4_000_000_000_u32, + ])); + let field = Field::new("col", DataType::UInt32, false); + let sbbf = build_sbbf(array, field.clone()); + + // Direct check should succeed (bit pattern preserved) + assert!(sbbf.check(&test_value), "Direct check should succeed"); + + // ArrowSbbf check should also succeed + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_uint64_no_coercion() { + // UInt64 -> Parquet INT64 (reinterpret cast preserves bit pattern) + let test_value = 10_000_000_000_000_000_000_u64; // > i64::MAX + let array = Arc::new(UInt64Array::from(vec![ + 100_u64, + test_value, + 15_000_000_000_000_000_000_u64, + ])); + let field = Field::new("col", DataType::UInt64, false); + let sbbf = build_sbbf(array, field.clone()); + + // Direct check should succeed (bit pattern preserved) + assert!(sbbf.check(&test_value), "Direct check should succeed"); + + // ArrowSbbf check should also succeed + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_decimal128_small_coercion() { + // Decimal128(5, 2) -> Parquet INT32 (precision 1-9, truncation changes bytes) + let test_value = 20075_i128; + let array = Decimal128Array::from(vec![10050_i128, test_value, 30099_i128]) + .with_precision_and_scale(5, 2) + .unwrap(); + let field = Field::new("col", DataType::Decimal128(5, 2), false); + let schema = Arc::new(Schema::new(vec![field.clone()])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + + // Write with bloom filter + let mut file = tempfile().unwrap(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read back + let options = ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + let row_group_reader = reader.get_row_group(0).unwrap(); + let bloom_filter = row_group_reader + .get_column_bloom_filter(0) + .expect("Bloom filter should exist"); + + // Test 1: Direct check with i128 bytes should fail (bloom filter has i32) + let test_bytes = test_value.to_le_bytes(); + let direct_result = bloom_filter.check(&test_bytes[..]); + println!( + "Decimal128(5,2): Direct Sbbf check with i128 bytes = {}", + direct_result + ); + assert!( + !direct_result, + "Direct check with i128 bytes should fail (bloom filter has i32)" + ); + + // Test 2: ArrowSbbf check should succeed (coerces i128 to i32) + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + println!( + "Decimal128(5,2): ArrowSbbf check with i128 bytes = {}", + arrow_result + ); + assert!( + arrow_result, + "ArrowSbbf check should succeed (coerces to i32)" + ); + } + + #[test] + fn test_decimal128_medium_coercion() { + // Decimal128(15, 2) -> Parquet INT64 (precision 10-18, truncation changes bytes) + // Direct Sbbf check: false (bug - checking i128 but filter has i64) + // ArrowSbbf check: true (fix - coerces i128 to i64) + let test_value = 9876543210987_i128; + let array = Decimal128Array::from(vec![1234567890123_i128, test_value, 5555555555555_i128]) + .with_precision_and_scale(15, 2) + .unwrap(); + let field = Field::new("col", DataType::Decimal128(15, 2), false); + let schema = Arc::new(Schema::new(vec![field.clone()])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + + // Write with bloom filter + let mut file = tempfile().unwrap(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read back + let options = ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + let row_group_reader = reader.get_row_group(0).unwrap(); + let bloom_filter = row_group_reader + .get_column_bloom_filter(0) + .expect("Bloom filter should exist"); + + // Test 1: Direct check with i128 bytes should fail (bloom filter has i64) + let test_bytes = test_value.to_le_bytes(); + let direct_result = bloom_filter.check(&test_bytes[..]); + println!( + "Decimal128(15,2): Direct Sbbf check with i128 bytes = {}", + direct_result + ); + assert!( + !direct_result, + "Direct check with i128 bytes should fail (bloom filter has i64)" + ); + + // Test 2: ArrowSbbf check should succeed (coerces i128 to i64) + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + println!( + "Decimal128(15,2): ArrowSbbf check with i128 bytes = {}", + arrow_result + ); + assert!( + arrow_result, + "ArrowSbbf check should succeed (coerces to i64)" + ); + } + + #[test] + fn test_decimal32_no_coercion() { + // Decimal32(5, 2) -> Parquet INT32 (reinterpret cast preserves bit pattern for small precision) + // Direct Sbbf check: true (works without ArrowSbbf for Decimal32) + // ArrowSbbf check: true (wrapper doesn't hurt) + // Note: Decimal32 stores i32 natively, so no truncation occurs + let test_value = 20075_i32; + let array = Decimal32Array::from(vec![10050_i32, test_value, 30099_i32]) + .with_precision_and_scale(5, 2) + .unwrap(); + let field = Field::new("col", DataType::Decimal32(5, 2), false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // Direct check should succeed (bit pattern preserved) + assert!(sbbf.check(&test_value), "Direct check should succeed"); + + // ArrowSbbf check should also succeed + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_float16_no_coercion() { + // Float16 -> Parquet FIXED_LEN_BYTE_ARRAY(2) (stored as-is) + // Direct Sbbf check: true (bit pattern preserved) + // ArrowSbbf check: true (wrapper doesn't hurt) + use half::f16; + let test_value = f16::from_f32(2.5); + let array = Float16Array::from(vec![f16::from_f32(1.5), test_value, f16::from_f32(3.5)]); + let field = Field::new("col", DataType::Float16, false); + let schema = Arc::new(Schema::new(vec![field.clone()])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + + // Write with bloom filter + let mut file = tempfile().unwrap(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read back + let options = ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + let row_group_reader = reader.get_row_group(0).unwrap(); + let bloom_filter = row_group_reader + .get_column_bloom_filter(0) + .expect("Bloom filter should exist"); + + // Test 1: Direct check with f16 bytes should work (bit pattern preserved) + let test_bytes = test_value.to_le_bytes(); + let direct_result = bloom_filter.check(&test_bytes[..]); + println!("Float16: Direct Sbbf check with bytes = {}", direct_result); + assert!(direct_result, "Direct check with bytes should succeed"); + + // Test 2: ArrowSbbf check should also work + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + println!("Float16: ArrowSbbf check with bytes = {}", arrow_result); + assert!(arrow_result, "ArrowSbbf check should succeed"); + } + + #[test] + fn test_date64_no_coercion_by_default() { + // Date64 -> Parquet INT64 (stored as-is when coerce_types=false, which is default) + // Direct Sbbf check: true (no coercion by default) + // ArrowSbbf check: true (wrapper doesn't hurt) + // Note: Date64 CAN be coerced to Date32/INT32 with coerce_types=true, + // but that's not the default behavior, so bloom filters work correctly by default + let test_value = 2_000_000_000_i64; + let array = Date64Array::from(vec![1_000_000_000_i64, test_value, 3_000_000_000_i64]); + let field = Field::new("col", DataType::Date64, false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // Direct check should succeed (no coercion by default) + assert!(sbbf.check(&test_value), "Direct check should succeed"); + + // ArrowSbbf check should also succeed + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + assert!( + arrow_sbbf.check(&test_value), + "ArrowSbbf check should succeed" + ); + } + + #[test] + fn test_date64_with_coerce_types() { + // This test documents a KNOWN LIMITATION of ArrowSbbf. + // + // When WriterProperties::set_coerce_types(true) is used: + // - Date64 (i64 milliseconds) -> Date32 (i32 days) -> Parquet INT32 + // - Conversion: milliseconds / 86_400_000 = days + // + // ArrowSbbf cannot currently detect this case because it only has access to + // the Arrow DataType, not the Parquet schema or coerce_types setting. + // + // Potential solutions (not implemented): + // 1. Double-check both i64 and i32 representations (increases false positive rate ~2x) + // 2. Change ArrowSbbf API to accept Parquet column descriptor (breaking change) + // 3. Require users to manually convert ms -> days when using coerce_types + // + // We chose option 3 (documented limitation) because: + // - coerce_types=true is not the default + // - Date64 + bloom filters + coerce_types is a rare combination + // - Can wait for user feedback before adding complexity + + const MS_PER_DAY: i64 = 86_400_000; + let test_value_ms = 10 * MS_PER_DAY; // 10 days in milliseconds + let array = Date64Array::from(vec![5 * MS_PER_DAY, test_value_ms, 15 * MS_PER_DAY]); + let field = Field::new("col", DataType::Date64, false); + let schema = Arc::new(Schema::new(vec![field.clone()])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + + // Write with bloom filter AND coerce_types enabled + let mut file = tempfile().unwrap(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .set_coerce_types(true) // This causes Date64 -> Date32 (INT32) + .build(); + let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read back + let options = ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + let row_group_reader = reader.get_row_group(0).unwrap(); + let bloom_filter = row_group_reader + .get_column_bloom_filter(0) + .expect("Bloom filter should exist"); + + // Test 1: Direct check with i64 milliseconds fails (bloom filter has i32 days) + let direct_result = bloom_filter.check(&test_value_ms); + println!( + "Date64 (coerce_types=true): Direct Sbbf check with i64 ms = {}", + direct_result + ); + assert!( + !direct_result, + "Direct check with i64 ms should fail (bloom filter has i32 days)" + ); + + // Test 2: ArrowSbbf also fails (KNOWN LIMITATION - not yet implemented) + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_result = arrow_sbbf.check(&test_value_ms); + println!( + "Date64 (coerce_types=true): ArrowSbbf check with i64 ms = {}", + arrow_result + ); + assert!( + !arrow_result, + "ArrowSbbf cannot handle Date64 coercion (documented limitation)" + ); + + // Workaround: Manually convert to days before checking + let days = (test_value_ms / MS_PER_DAY) as i32; + let workaround_result = bloom_filter.check(&days); + println!( + "Date64 (coerce_types=true): Manual conversion to i32 days = {}", + workaround_result + ); + assert!( + workaround_result, + "Workaround: checking with i32 days should succeed" + ); + } +} diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 69274e689ae2..0b1f5884f296 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -182,6 +182,7 @@ experimental!(mod array_reader); pub mod arrow_reader; pub mod arrow_writer; +pub mod bloom_filter; mod buffer; mod decoder; From 15f131c52de3f928fd164f4a687c318d9242d832 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Thu, 2 Oct 2025 17:00:08 -0700 Subject: [PATCH 02/17] feat(parquet): sbbf benchmarks --- parquet/Cargo.toml | 5 + parquet/benches/bloom_filter.rs | 169 ++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 parquet/benches/bloom_filter.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 09b9a916917a..3cdfb83e9f15 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -257,5 +257,10 @@ name = "row_selector" harness = false required-features = ["arrow"] +[[bench]] +name = "bloom_filter" +harness = false +required-features = ["arrow"] + [lib] bench = false diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs new file mode 100644 index 000000000000..21998a03c9f7 --- /dev/null +++ b/parquet/benches/bloom_filter.rs @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ArrayRef, Decimal128Array, Int32Array, Int8Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; +use criterion::*; +use parquet::arrow::arrow_writer::ArrowWriter; +use parquet::arrow::bloom_filter::ArrowSbbf; +use parquet::bloom_filter::Sbbf; +use parquet::file::properties::{ReaderProperties, WriterProperties}; +use parquet::file::reader::{FileReader, SerializedFileReader}; +use parquet::file::serialized_reader::ReadOptionsBuilder; +use std::hint; +use std::sync::Arc; +use tempfile::tempfile; + +/// Helper function to create an Sbbf from an array for benchmarking +/// +/// Writes the given array to a Parquet file with bloom filters enabled, +/// then reads it back and returns the bloom filter for the first column. +fn setup_sbbf(array: ArrayRef, field: Field) -> Sbbf { + let schema = Arc::new(Schema::new(vec![field.clone()])); + let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap(); + + // Write to parquet with bloom filter + let mut file = tempfile().unwrap(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read bloom filter back + let options = ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + let row_group_reader = reader.get_row_group(0).unwrap(); + + row_group_reader + .get_column_bloom_filter(0) + .expect("Bloom filter should exist") + .clone() +} + +fn bench_integer_types(c: &mut Criterion) { + // Setup for Int8 benchmarks (requires coercion to i32) + let int8_array = Arc::new(Int8Array::from(vec![42i8; 1000])) as ArrayRef; + let int8_field = Field::new("col", DataType::Int8, false); + let sbbf_int8 = setup_sbbf(int8_array, int8_field); + let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8); + let test_val_i32 = 42i32; + let test_val_i8 = 42i8; + + // Setup for Int32 benchmarks (no coercion needed) + let int32_array = Arc::new(Int32Array::from(vec![42i32; 1000])) as ArrayRef; + let int32_field = Field::new("col", DataType::Int32, false); + let sbbf_int32 = setup_sbbf(int32_array, int32_field); + let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32); + + // Benchmark 1: Direct Sbbf::check with i32 (baseline) + c.bench_function("Sbbf::check i32", |b| { + b.iter(|| { + let result = sbbf_int8.check(&test_val_i32); + hint::black_box(result); + }) + }); + + // Benchmark 2: ArrowSbbf::check with i8 (requires coercion to i32) + c.bench_function("ArrowSbbf::check i8 (coerce to i32)", |b| { + b.iter(|| { + let result = arrow_sbbf_int8.check(&test_val_i8); + hint::black_box(result); + }) + }); + + // Benchmark 3: ArrowSbbf::check with i32 (no coercion, passthrough) + c.bench_function("ArrowSbbf::check i32 (no coercion)", |b| { + b.iter(|| { + let result = arrow_sbbf_int32.check(&test_val_i32); + hint::black_box(result); + }) + }); +} + +fn bench_decimal_types(c: &mut Criterion) { + // Setup for Decimal128 small precision (coerces to i32) + let dec_small_array = Arc::new( + Decimal128Array::from(vec![123456i128; 1000]) + .with_precision_and_scale(5, 2) + .unwrap(), + ) as ArrayRef; + let dec_small_field = Field::new("col", DataType::Decimal128(5, 2), false); + let sbbf_dec_small = setup_sbbf(dec_small_array, dec_small_field); + let arrow_sbbf_dec_small = ArrowSbbf::new(&sbbf_dec_small, &DataType::Decimal128(5, 2)); + let test_val_dec_small = 123456i128; + + // Setup for Decimal128 medium precision (coerces to i64) + let dec_medium_array = Arc::new( + Decimal128Array::from(vec![123456789012345i128; 1000]) + .with_precision_and_scale(15, 2) + .unwrap(), + ) as ArrayRef; + let dec_medium_field = Field::new("col", DataType::Decimal128(15, 2), false); + let sbbf_dec_medium = setup_sbbf(dec_medium_array, dec_medium_field); + let arrow_sbbf_dec_medium = ArrowSbbf::new(&sbbf_dec_medium, &DataType::Decimal128(15, 2)); + let test_val_dec_medium = 123456789012345i128; + + // Setup for Decimal128 large precision (no coercion) + let dec_large_array = Arc::new( + Decimal128Array::from(vec![123456789012345678901234567890i128; 1000]) + .with_precision_and_scale(30, 2) + .unwrap(), + ) as ArrayRef; + let dec_large_field = Field::new("col", DataType::Decimal128(30, 2), false); + let sbbf_dec_large = setup_sbbf(dec_large_array, dec_large_field); + let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2)); + let test_val_dec_large = 123456789012345678901234567890i128; + + // Benchmark 1: ArrowSbbf::check with Decimal128 small (coerce to i32) + c.bench_function("ArrowSbbf::check Decimal128(5,2) (coerce to i32)", |b| { + b.iter(|| { + let test_bytes = test_val_dec_small.to_le_bytes(); + let result = arrow_sbbf_dec_small.check(&test_bytes[..]); + hint::black_box(result); + }) + }); + + // Benchmark 2: ArrowSbbf::check with Decimal128 medium (coerce to i64) + c.bench_function("ArrowSbbf::check Decimal128(15,2) (coerce to i64)", |b| { + b.iter(|| { + let test_bytes = test_val_dec_medium.to_le_bytes(); + let result = arrow_sbbf_dec_medium.check(&test_bytes[..]); + hint::black_box(result); + }) + }); + + // Benchmark 3: ArrowSbbf::check with Decimal128 large (no coercion) + c.bench_function("ArrowSbbf::check Decimal128(30,2) (no coercion)", |b| { + b.iter(|| { + let test_bytes = test_val_dec_large.to_le_bytes(); + let result = arrow_sbbf_dec_large.check(&test_bytes[..]); + hint::black_box(result); + }) + }); +} + +criterion_group!(benches_int, bench_integer_types); +criterion_group!(benches_decimal, bench_decimal_types); +criterion_main!(benches_int, benches_decimal); From 8e372d953bfeddf28e829a5fbaa1f7b293279733 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Thu, 2 Oct 2025 17:47:48 -0700 Subject: [PATCH 03/17] fix(parquet): arrow sbbf date64 support --- parquet/benches/bloom_filter.rs | 34 ++-- parquet/src/arrow/bloom_filter.rs | 285 +++++++++++++++++------------- parquet/src/bloom_filter/mod.rs | 2 +- 3 files changed, 181 insertions(+), 140 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index 21998a03c9f7..6eb86c7007a2 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -20,6 +20,7 @@ use arrow_schema::{DataType, Field, Schema}; use criterion::*; use parquet::arrow::arrow_writer::ArrowWriter; use parquet::arrow::bloom_filter::ArrowSbbf; +use parquet::basic::Type as ParquetType; use parquet::bloom_filter::Sbbf; use parquet::file::properties::{ReaderProperties, WriterProperties}; use parquet::file::reader::{FileReader, SerializedFileReader}; @@ -31,8 +32,8 @@ use tempfile::tempfile; /// Helper function to create an Sbbf from an array for benchmarking /// /// Writes the given array to a Parquet file with bloom filters enabled, -/// then reads it back and returns the bloom filter for the first column. -fn setup_sbbf(array: ArrayRef, field: Field) -> Sbbf { +/// then reads it back and returns the bloom filter and physical type for the first column. +fn setup_sbbf(array: ArrayRef, field: Field) -> (Sbbf, ParquetType) { let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap(); @@ -55,27 +56,32 @@ fn setup_sbbf(array: ArrayRef, field: Field) -> Sbbf { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let row_group_reader = reader.get_row_group(0).unwrap(); + let metadata = row_group_reader.metadata(); + let column_chunk = metadata.column(0); + let parquet_type = column_chunk.column_type(); - row_group_reader + let sbbf = row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist") - .clone() + .clone(); + + (sbbf, parquet_type) } fn bench_integer_types(c: &mut Criterion) { // Setup for Int8 benchmarks (requires coercion to i32) let int8_array = Arc::new(Int8Array::from(vec![42i8; 1000])) as ArrayRef; let int8_field = Field::new("col", DataType::Int8, false); - let sbbf_int8 = setup_sbbf(int8_array, int8_field); - let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8); + let (sbbf_int8, physical_type_int8) = setup_sbbf(int8_array, int8_field); + let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8, physical_type_int8); let test_val_i32 = 42i32; let test_val_i8 = 42i8; // Setup for Int32 benchmarks (no coercion needed) let int32_array = Arc::new(Int32Array::from(vec![42i32; 1000])) as ArrayRef; let int32_field = Field::new("col", DataType::Int32, false); - let sbbf_int32 = setup_sbbf(int32_array, int32_field); - let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32); + let (sbbf_int32, physical_type_int32) = setup_sbbf(int32_array, int32_field); + let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32, physical_type_int32); // Benchmark 1: Direct Sbbf::check with i32 (baseline) c.bench_function("Sbbf::check i32", |b| { @@ -110,8 +116,8 @@ fn bench_decimal_types(c: &mut Criterion) { .unwrap(), ) as ArrayRef; let dec_small_field = Field::new("col", DataType::Decimal128(5, 2), false); - let sbbf_dec_small = setup_sbbf(dec_small_array, dec_small_field); - let arrow_sbbf_dec_small = ArrowSbbf::new(&sbbf_dec_small, &DataType::Decimal128(5, 2)); + let (sbbf_dec_small, physical_type_dec_small) = setup_sbbf(dec_small_array, dec_small_field); + let arrow_sbbf_dec_small = ArrowSbbf::new(&sbbf_dec_small, &DataType::Decimal128(5, 2), physical_type_dec_small); let test_val_dec_small = 123456i128; // Setup for Decimal128 medium precision (coerces to i64) @@ -121,8 +127,8 @@ fn bench_decimal_types(c: &mut Criterion) { .unwrap(), ) as ArrayRef; let dec_medium_field = Field::new("col", DataType::Decimal128(15, 2), false); - let sbbf_dec_medium = setup_sbbf(dec_medium_array, dec_medium_field); - let arrow_sbbf_dec_medium = ArrowSbbf::new(&sbbf_dec_medium, &DataType::Decimal128(15, 2)); + let (sbbf_dec_medium, physical_type_dec_medium) = setup_sbbf(dec_medium_array, dec_medium_field); + let arrow_sbbf_dec_medium = ArrowSbbf::new(&sbbf_dec_medium, &DataType::Decimal128(15, 2), physical_type_dec_medium); let test_val_dec_medium = 123456789012345i128; // Setup for Decimal128 large precision (no coercion) @@ -132,8 +138,8 @@ fn bench_decimal_types(c: &mut Criterion) { .unwrap(), ) as ArrayRef; let dec_large_field = Field::new("col", DataType::Decimal128(30, 2), false); - let sbbf_dec_large = setup_sbbf(dec_large_array, dec_large_field); - let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2)); + let (sbbf_dec_large, physical_type_dec_large) = setup_sbbf(dec_large_array, dec_large_field); + let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2), physical_type_dec_large); let test_val_dec_large = 123456789012345678901234567890i128; // Benchmark 1: ArrowSbbf::check with Decimal128 small (coerce to i32) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index 7e2f041e66f5..b3b195c573e5 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -47,9 +47,13 @@ //! use arrow_schema::DataType; //! use parquet::arrow::bloom_filter::ArrowSbbf; //! -//! // Get bloom filter from row group reader +//! // Get bloom filter and metadata from row group reader +//! let column_chunk = row_group_reader.metadata().column(0); //! let sbbf = row_group_reader.get_column_bloom_filter(0)?; -//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Int8); +//! let parquet_type = column_chunk.column_type(); +//! +//! // Create ArrowSbbf with both logical and physical types +//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int8, parquet_type); //! //! // Check with i8 value - automatically coerced to i32 //! let value: i8 = 42; @@ -58,25 +62,22 @@ //! } //! ``` //! -//! # Known Limitations +//! # Date64 Handling //! -//! **Date64 with coerce_types=true**: When [`WriterProperties::set_coerce_types`] is enabled, -//! `Date64` values (i64 milliseconds) are coerced to `Date32` (i32 days). Currently, -//! [`ArrowSbbf`] cannot detect this case and will produce false negatives when checking -//! `Date64` values against bloom filters created with `coerce_types=true`. +//! [`ArrowSbbf`] correctly handles `Date64` values regardless of whether +//! [`WriterProperties::set_coerce_types`] was used: //! -//! Workaround: Manually convert milliseconds to days before checking: -//! ```ignore -//! let ms = 864_000_000_i64; // 10 days in milliseconds -//! let days = (ms / 86_400_000) as i32; -//! arrow_sbbf.check(&days); // Check with i32 days instead -//! ``` +//! - When stored as INT64 (default): Values checked as-is (milliseconds since epoch) +//! - When stored as INT32 (coerce_types=true): Values automatically converted from milliseconds to days +//! +//! The physical type determines the correct conversion automatically. //! //! [`WriterProperties::set_coerce_types`]: crate::file::properties::WriterPropertiesBuilder::set_coerce_types +use crate::basic::Type as ParquetType; use crate::bloom_filter::Sbbf; use crate::data_type::AsBytes; -use arrow_schema::DataType; +use arrow_schema::DataType as ArrowType; /// Wraps an [`Sbbf`] and provides automatic type coercion based on Arrow schema. /// Ensures that checking bloom filters works correctly for Arrow types that require @@ -84,15 +85,27 @@ use arrow_schema::DataType; #[derive(Debug, Clone)] pub struct ArrowSbbf<'a> { sbbf: &'a Sbbf, - arrow_type: &'a DataType, + arrow_type: &'a ArrowType, + parquet_type: ParquetType, } impl<'a> ArrowSbbf<'a> { /// Create a new Arrow-aware bloom filter wrapper + /// + /// # Arguments /// * `sbbf` - Parquet bloom filter for the column - /// * `arrow_type` - Arrow data type for the column - pub fn new(sbbf: &'a Sbbf, arrow_type: &'a DataType) -> Self { - Self { sbbf, arrow_type } + /// * `arrow_type` - Arrow data type for the column (logical type) + /// * `parquet_type` - Parquet physical type for the column + /// + /// The Parquet type can be obtained from [`ColumnChunkMetaData::column_type()`]. + /// + /// [`ColumnChunkMetaData::column_type()`]: crate::file::metadata::ColumnChunkMetaData::column_type + pub fn new(sbbf: &'a Sbbf, arrow_type: &'a ArrowType, parquet_type: ParquetType) -> Self { + Self { + sbbf, + arrow_type, + parquet_type, + } } /// Check if a value might be present in the bloom filter @@ -102,7 +115,7 @@ impl<'a> ArrowSbbf<'a> { /// or `false` if the value is definitely not present. pub fn check(&self, value: &T) -> bool { match self.arrow_type { - DataType::Int8 => { + ArrowType::Int8 => { // Arrow Int8 -> Parquet INT32 let bytes = value.as_bytes(); if bytes.len() == 1 { @@ -114,7 +127,7 @@ impl<'a> ArrowSbbf<'a> { self.sbbf.check(value) } } - DataType::Int16 => { + ArrowType::Int16 => { // Arrow Int16 -> Parquet INT32 let bytes = value.as_bytes(); if bytes.len() == 2 { @@ -126,7 +139,7 @@ impl<'a> ArrowSbbf<'a> { self.sbbf.check(value) } } - DataType::UInt8 => { + ArrowType::UInt8 => { // Arrow UInt8 -> Parquet INT32 let bytes = value.as_bytes(); if bytes.len() == 1 { @@ -139,7 +152,7 @@ impl<'a> ArrowSbbf<'a> { self.sbbf.check(value) } } - DataType::UInt16 => { + ArrowType::UInt16 => { // Arrow UInt16 -> Parquet INT32 let bytes = value.as_bytes(); if bytes.len() == 2 { @@ -152,10 +165,10 @@ impl<'a> ArrowSbbf<'a> { self.sbbf.check(value) } } - DataType::Decimal32(precision, _) - | DataType::Decimal64(precision, _) - | DataType::Decimal128(precision, _) - | DataType::Decimal256(precision, _) + ArrowType::Decimal32(precision, _) + | ArrowType::Decimal64(precision, _) + | ArrowType::Decimal128(precision, _) + | ArrowType::Decimal256(precision, _) if *precision >= 1 && *precision <= 9 => { // Decimal with precision 1-9 -> Parquet INT32 @@ -200,9 +213,9 @@ impl<'a> ArrowSbbf<'a> { _ => self.sbbf.check(value), } } - DataType::Decimal64(precision, _) - | DataType::Decimal128(precision, _) - | DataType::Decimal256(precision, _) + ArrowType::Decimal64(precision, _) + | ArrowType::Decimal128(precision, _) + | ArrowType::Decimal256(precision, _) if *precision >= 10 && *precision <= 18 => { // Decimal with precision 10-18 -> Parquet INT64 @@ -241,6 +254,36 @@ impl<'a> ArrowSbbf<'a> { _ => self.sbbf.check(value), } } + ArrowType::Date64 => { + // Date64 can be stored as either INT32 or INT64 depending on coerce_types + let bytes = value.as_bytes(); + if bytes.len() == 8 { + let i64_val = i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], + ]); + match self.parquet_type { + ParquetType::INT32 => { + // Date64 was coerced to Date32 (days since epoch) + // Convert milliseconds to days: ms / 86_400_000 + const MS_PER_DAY: i64 = 86_400_000; + let days = (i64_val / MS_PER_DAY) as i32; + self.sbbf.check(&days) + } + ParquetType::INT64 => { + // Date64 stored as-is (milliseconds since epoch) + self.sbbf.check(&i64_val) + } + _ => { + // Unexpected physical type for Date64, fall back to direct check + self.sbbf.check(value) + } + } + } else { + // Unexpected size, fall back to direct check + self.sbbf.check(value) + } + } // No coercion needed _ => self.sbbf.check(value), } @@ -258,24 +301,35 @@ mod tests { ArrayRef, Date64Array, Decimal128Array, Decimal32Array, Float16Array, Int16Array, Int8Array, RecordBatch, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{Field, Schema}; use std::sync::Arc; use tempfile::tempfile; /// Helper function to build a bloom filter for testing /// /// Writes the given array to a Parquet file with bloom filters enabled, - /// then reads it back and returns the bloom filter for the first column. - fn build_sbbf(array: ArrayRef, field: Field) -> Sbbf { + /// then reads it back and returns the bloom filter and physical type for the first column. + fn build_sbbf(array: ArrayRef, field: Field) -> (Sbbf, ParquetType) { + build_sbbf_with_props( + array, + field, + WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(), + ) + } + + /// Helper function to build a bloom filter with custom writer properties + fn build_sbbf_with_props( + array: ArrayRef, + field: Field, + props: WriterProperties, + ) -> (Sbbf, ParquetType) { let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap(); - // Write with bloom filter enabled + // Write with custom properties let mut file = tempfile().unwrap(); - let props = WriterProperties::builder() - .set_bloom_filter_enabled(true) - .build(); - let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); writer.write(&batch).unwrap(); writer.close().unwrap(); @@ -290,12 +344,17 @@ mod tests { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); - // Get and return the bloom filter + // Get bloom filter and physical type let row_group_reader = reader.get_row_group(0).unwrap(); - row_group_reader + let metadata = row_group_reader.metadata(); + let column_chunk = metadata.column(0); + let parquet_type = column_chunk.column_type(); + let sbbf = row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist") - .clone() + .clone(); + + (sbbf, parquet_type) } #[test] @@ -303,14 +362,14 @@ mod tests { // Int8 -> Parquet INT32 (sign extension changes bytes) let test_value = 3_i8; let array = Arc::new(Int8Array::from(vec![1_i8, 2, test_value, 4, 5])); - let field = Field::new("col", DataType::Int8, false); - let sbbf = build_sbbf(array, field.clone()); + let field = Field::new("col", ArrowType::Int8, false); + let (sbbf, parquet_type) = build_sbbf(array, field.clone()); // Direct check should fail (bloom filter has i32) assert!(!sbbf.check(&test_value), "Direct check should fail"); // ArrowSbbf check should succeed (coerces i8 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -322,14 +381,14 @@ mod tests { // Int16 -> Parquet INT32 (sign extension changes bytes) let test_value = 300_i16; let array = Arc::new(Int16Array::from(vec![100_i16, 200, test_value, 400, 500])); - let field = Field::new("col", DataType::Int16, false); - let sbbf = build_sbbf(array, field.clone()); + let field = Field::new("col", ArrowType::Int16, false); + let (sbbf, parquet_type) = build_sbbf(array, field.clone()); // Direct check should fail (bloom filter has i32) assert!(!sbbf.check(&test_value), "Direct check should fail"); // ArrowSbbf check should succeed (coerces i16 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -341,14 +400,14 @@ mod tests { // UInt8 -> Parquet INT32 (zero extension changes bytes) let test_value = 30_u8; let array = Arc::new(UInt8Array::from(vec![10_u8, 20, test_value, 40, 50])); - let field = Field::new("col", DataType::UInt8, false); - let sbbf = build_sbbf(array, field.clone()); + let field = Field::new("col", ArrowType::UInt8, false); + let (sbbf, parquet_type) = build_sbbf(array, field.clone()); // Direct check should fail (bloom filter has i32) assert!(!sbbf.check(&test_value), "Direct check should fail"); // ArrowSbbf check should succeed (coerces u8 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -362,14 +421,14 @@ mod tests { let array = Arc::new(UInt16Array::from(vec![ 1000_u16, 2000, test_value, 4000, 5000, ])); - let field = Field::new("col", DataType::UInt16, false); - let sbbf = build_sbbf(array, field.clone()); + let field = Field::new("col", ArrowType::UInt16, false); + let (sbbf, parquet_type) = build_sbbf(array, field.clone()); // Direct check should fail (bloom filter has i32) assert!(!sbbf.check(&test_value), "Direct check should fail"); // ArrowSbbf check should succeed (coerces u16 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -385,14 +444,14 @@ mod tests { test_value, 4_000_000_000_u32, ])); - let field = Field::new("col", DataType::UInt32, false); - let sbbf = build_sbbf(array, field.clone()); + let field = Field::new("col", ArrowType::UInt32, false); + let (sbbf, parquet_type) = build_sbbf(array, field.clone()); // Direct check should succeed (bit pattern preserved) assert!(sbbf.check(&test_value), "Direct check should succeed"); // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -408,14 +467,14 @@ mod tests { test_value, 15_000_000_000_000_000_000_u64, ])); - let field = Field::new("col", DataType::UInt64, false); - let sbbf = build_sbbf(array, field.clone()); + let field = Field::new("col", ArrowType::UInt64, false); + let (sbbf, parquet_type) = build_sbbf(array, field.clone()); // Direct check should succeed (bit pattern preserved) assert!(sbbf.check(&test_value), "Direct check should succeed"); // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -429,7 +488,7 @@ mod tests { let array = Decimal128Array::from(vec![10050_i128, test_value, 30099_i128]) .with_precision_and_scale(5, 2) .unwrap(); - let field = Field::new("col", DataType::Decimal128(5, 2), false); + let field = Field::new("col", ArrowType::Decimal128(5, 2), false); let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); @@ -452,6 +511,9 @@ mod tests { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let row_group_reader = reader.get_row_group(0).unwrap(); + let metadata = row_group_reader.metadata(); + let column_chunk = metadata.column(0); + let parquet_type = column_chunk.column_type(); let bloom_filter = row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist"); @@ -469,7 +531,7 @@ mod tests { ); // Test 2: ArrowSbbf check should succeed (coerces i128 to i32) - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); let arrow_result = arrow_sbbf.check(&test_bytes[..]); println!( "Decimal128(5,2): ArrowSbbf check with i128 bytes = {}", @@ -490,7 +552,7 @@ mod tests { let array = Decimal128Array::from(vec![1234567890123_i128, test_value, 5555555555555_i128]) .with_precision_and_scale(15, 2) .unwrap(); - let field = Field::new("col", DataType::Decimal128(15, 2), false); + let field = Field::new("col", ArrowType::Decimal128(15, 2), false); let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); @@ -513,6 +575,9 @@ mod tests { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let row_group_reader = reader.get_row_group(0).unwrap(); + let metadata = row_group_reader.metadata(); + let column_chunk = metadata.column(0); + let parquet_type = column_chunk.column_type(); let bloom_filter = row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist"); @@ -530,7 +595,7 @@ mod tests { ); // Test 2: ArrowSbbf check should succeed (coerces i128 to i64) - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); let arrow_result = arrow_sbbf.check(&test_bytes[..]); println!( "Decimal128(15,2): ArrowSbbf check with i128 bytes = {}", @@ -552,14 +617,14 @@ mod tests { let array = Decimal32Array::from(vec![10050_i32, test_value, 30099_i32]) .with_precision_and_scale(5, 2) .unwrap(); - let field = Field::new("col", DataType::Decimal32(5, 2), false); - let sbbf = build_sbbf(Arc::new(array), field.clone()); + let field = Field::new("col", ArrowType::Decimal32(5, 2), false); + let (sbbf, parquet_type) = build_sbbf(Arc::new(array), field.clone()); // Direct check should succeed (bit pattern preserved) assert!(sbbf.check(&test_value), "Direct check should succeed"); // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -574,7 +639,7 @@ mod tests { use half::f16; let test_value = f16::from_f32(2.5); let array = Float16Array::from(vec![f16::from_f32(1.5), test_value, f16::from_f32(3.5)]); - let field = Field::new("col", DataType::Float16, false); + let field = Field::new("col", ArrowType::Float16, false); let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); @@ -597,6 +662,9 @@ mod tests { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let row_group_reader = reader.get_row_group(0).unwrap(); + let metadata = row_group_reader.metadata(); + let column_chunk = metadata.column(0); + let parquet_type = column_chunk.column_type(); let bloom_filter = row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist"); @@ -608,7 +676,7 @@ mod tests { assert!(direct_result, "Direct check with bytes should succeed"); // Test 2: ArrowSbbf check should also work - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); let arrow_result = arrow_sbbf.check(&test_bytes[..]); println!("Float16: ArrowSbbf check with bytes = {}", arrow_result); assert!(arrow_result, "ArrowSbbf check should succeed"); @@ -623,14 +691,14 @@ mod tests { // but that's not the default behavior, so bloom filters work correctly by default let test_value = 2_000_000_000_i64; let array = Date64Array::from(vec![1_000_000_000_i64, test_value, 3_000_000_000_i64]); - let field = Field::new("col", DataType::Date64, false); - let sbbf = build_sbbf(Arc::new(array), field.clone()); + let field = Field::new("col", ArrowType::Date64, false); + let (sbbf, parquet_type) = build_sbbf(Arc::new(array), field.clone()); // Direct check should succeed (no coercion by default) assert!(sbbf.check(&test_value), "Direct check should succeed"); // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type()); + let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); assert!( arrow_sbbf.check(&test_value), "ArrowSbbf check should succeed" @@ -639,89 +707,56 @@ mod tests { #[test] fn test_date64_with_coerce_types() { - // This test documents a KNOWN LIMITATION of ArrowSbbf. + // This test verifies that ArrowSbbf correctly handles Date64 when coerce_types=true. // // When WriterProperties::set_coerce_types(true) is used: // - Date64 (i64 milliseconds) -> Date32 (i32 days) -> Parquet INT32 // - Conversion: milliseconds / 86_400_000 = days // - // ArrowSbbf cannot currently detect this case because it only has access to - // the Arrow DataType, not the Parquet schema or coerce_types setting. - // - // Potential solutions (not implemented): - // 1. Double-check both i64 and i32 representations (increases false positive rate ~2x) - // 2. Change ArrowSbbf API to accept Parquet column descriptor (breaking change) - // 3. Require users to manually convert ms -> days when using coerce_types - // - // We chose option 3 (documented limitation) because: - // - coerce_types=true is not the default - // - Date64 + bloom filters + coerce_types is a rare combination - // - Can wait for user feedback before adding complexity + // ArrowSbbf detects this by inspecting the physical type (INT32 vs INT64) + // and automatically performs the milliseconds-to-days conversion. const MS_PER_DAY: i64 = 86_400_000; let test_value_ms = 10 * MS_PER_DAY; // 10 days in milliseconds let array = Date64Array::from(vec![5 * MS_PER_DAY, test_value_ms, 15 * MS_PER_DAY]); - let field = Field::new("col", DataType::Date64, false); - let schema = Arc::new(Schema::new(vec![field.clone()])); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let field = Field::new("col", ArrowType::Date64, false); - // Write with bloom filter AND coerce_types enabled - let mut file = tempfile().unwrap(); + // Write with coerce_types enabled let props = WriterProperties::builder() .set_bloom_filter_enabled(true) .set_coerce_types(true) // This causes Date64 -> Date32 (INT32) .build(); - let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); - - // Read back - let options = ReadOptionsBuilder::new() - .with_reader_properties( - ReaderProperties::builder() - .set_read_bloom_filter(true) - .build(), - ) - .build(); - let reader = SerializedFileReader::new_with_options(file, options).unwrap(); - let row_group_reader = reader.get_row_group(0).unwrap(); - let bloom_filter = row_group_reader - .get_column_bloom_filter(0) - .expect("Bloom filter should exist"); + let (bloom_filter, parquet_type) = + build_sbbf_with_props(Arc::new(array), field.clone(), props); + + // Verify physical type is INT32 (not INT64) + assert_eq!( + parquet_type, + ParquetType::INT32, + "Date64 with coerce_types=true should be stored as INT32" + ); // Test 1: Direct check with i64 milliseconds fails (bloom filter has i32 days) let direct_result = bloom_filter.check(&test_value_ms); - println!( - "Date64 (coerce_types=true): Direct Sbbf check with i64 ms = {}", - direct_result - ); assert!( !direct_result, "Direct check with i64 ms should fail (bloom filter has i32 days)" ); - // Test 2: ArrowSbbf also fails (KNOWN LIMITATION - not yet implemented) - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type()); + // Test 2: ArrowSbbf succeeds! It detects INT32 physical type and converts ms->days + let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); let arrow_result = arrow_sbbf.check(&test_value_ms); - println!( - "Date64 (coerce_types=true): ArrowSbbf check with i64 ms = {}", - arrow_result - ); assert!( - !arrow_result, - "ArrowSbbf cannot handle Date64 coercion (documented limitation)" + arrow_result, + "ArrowSbbf should handle Date64 coercion automatically" ); - // Workaround: Manually convert to days before checking + // Test 3: Manual conversion also works let days = (test_value_ms / MS_PER_DAY) as i32; - let workaround_result = bloom_filter.check(&days); - println!( - "Date64 (coerce_types=true): Manual conversion to i32 days = {}", - workaround_result - ); + let manual_result = bloom_filter.check(&days); assert!( - workaround_result, - "Workaround: checking with i32 days should succeed" + manual_result, + "Manual conversion to i32 days should succeed" ); } } diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 09302bab8fec..f54328a511a1 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -387,7 +387,7 @@ impl Sbbf { } /// Check if an [AsBytes] value is probably present or definitely absent in the filter - pub fn check(&self, value: &T) -> bool { + pub fn check(&self, value: &T) -> bool { self.check_hash(hash_as_bytes(value)) } From 94056faa140335012c0e104c5c386bbc9b2c1daa Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Thu, 2 Oct 2025 20:43:23 -0700 Subject: [PATCH 04/17] refactor(parquet): simplify bloom filter tests --- parquet/src/arrow/bloom_filter.rs | 393 ++++++++---------------------- 1 file changed, 100 insertions(+), 293 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index b3b195c573e5..7b8a443f3770 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -308,8 +308,8 @@ mod tests { /// Helper function to build a bloom filter for testing /// /// Writes the given array to a Parquet file with bloom filters enabled, - /// then reads it back and returns the bloom filter and physical type for the first column. - fn build_sbbf(array: ArrayRef, field: Field) -> (Sbbf, ParquetType) { + /// then reads it back and returns the bloom filter for the first column. + fn build_sbbf(array: ArrayRef, field: Field) -> Sbbf { build_sbbf_with_props( array, field, @@ -320,11 +320,7 @@ mod tests { } /// Helper function to build a bloom filter with custom writer properties - fn build_sbbf_with_props( - array: ArrayRef, - field: Field, - props: WriterProperties, - ) -> (Sbbf, ParquetType) { + fn build_sbbf_with_props(array: ArrayRef, field: Field, props: WriterProperties) -> Sbbf { let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap(); @@ -344,100 +340,78 @@ mod tests { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); - // Get bloom filter and physical type + // Get bloom filter let row_group_reader = reader.get_row_group(0).unwrap(); - let metadata = row_group_reader.metadata(); - let column_chunk = metadata.column(0); - let parquet_type = column_chunk.column_type(); - let sbbf = row_group_reader + row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist") - .clone(); - - (sbbf, parquet_type) + .clone() } #[test] - fn test_int8_coercion() { - // Int8 -> Parquet INT32 (sign extension changes bytes) + fn test_check_int8() { let test_value = 3_i8; let array = Arc::new(Int8Array::from(vec![1_i8, 2, test_value, 4, 5])); let field = Field::new("col", ArrowType::Int8, false); - let (sbbf, parquet_type) = build_sbbf(array, field.clone()); + let sbbf = build_sbbf(array, field.clone()); - // Direct check should fail (bloom filter has i32) - assert!(!sbbf.check(&test_value), "Direct check should fail"); + // Check without coercion fails + assert!(!sbbf.check(&test_value)); - // ArrowSbbf check should succeed (coerces i8 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow Int8 -> ParquetType::INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int8, ParquetType::INT32); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_int16_coercion() { - // Int16 -> Parquet INT32 (sign extension changes bytes) + fn test_check_int16() { let test_value = 300_i16; let array = Arc::new(Int16Array::from(vec![100_i16, 200, test_value, 400, 500])); let field = Field::new("col", ArrowType::Int16, false); - let (sbbf, parquet_type) = build_sbbf(array, field.clone()); + let sbbf = build_sbbf(array, field.clone()); - // Direct check should fail (bloom filter has i32) - assert!(!sbbf.check(&test_value), "Direct check should fail"); + // Check without coercion fails + assert!(!sbbf.check(&test_value)); - // ArrowSbbf check should succeed (coerces i16 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow Int16 -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int16, ParquetType::INT32); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_uint8_coercion() { - // UInt8 -> Parquet INT32 (zero extension changes bytes) + fn test_check_uint8() { let test_value = 30_u8; let array = Arc::new(UInt8Array::from(vec![10_u8, 20, test_value, 40, 50])); let field = Field::new("col", ArrowType::UInt8, false); - let (sbbf, parquet_type) = build_sbbf(array, field.clone()); + let sbbf = build_sbbf(array, field.clone()); - // Direct check should fail (bloom filter has i32) - assert!(!sbbf.check(&test_value), "Direct check should fail"); + // Check without coercion fails + assert!(!sbbf.check(&test_value)); - // ArrowSbbf check should succeed (coerces u8 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow UInt8 -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt8, ParquetType::INT32); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_uint16_coercion() { - // UInt16 -> Parquet INT32 (zero extension changes bytes) + fn test_check_uint16() { let test_value = 3000_u16; let array = Arc::new(UInt16Array::from(vec![ 1000_u16, 2000, test_value, 4000, 5000, ])); let field = Field::new("col", ArrowType::UInt16, false); - let (sbbf, parquet_type) = build_sbbf(array, field.clone()); + let sbbf = build_sbbf(array, field.clone()); - // Direct check should fail (bloom filter has i32) - assert!(!sbbf.check(&test_value), "Direct check should fail"); + // Check without coercion fails + assert!(!sbbf.check(&test_value)); - // ArrowSbbf check should succeed (coerces u16 to i32) - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow UInt16 -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt16, ParquetType::INT32); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_uint32_no_coercion() { - // UInt32 -> Parquet INT32 (reinterpret cast preserves bit pattern) + fn test_check_uint32() { let test_value = 3_000_000_000_u32; // > i32::MAX let array = Arc::new(UInt32Array::from(vec![ 100_u32, @@ -445,22 +419,18 @@ mod tests { 4_000_000_000_u32, ])); let field = Field::new("col", ArrowType::UInt32, false); - let (sbbf, parquet_type) = build_sbbf(array, field.clone()); + let sbbf = build_sbbf(array, field.clone()); - // Direct check should succeed (bit pattern preserved) - assert!(sbbf.check(&test_value), "Direct check should succeed"); + // No coercion necessary + assert!(sbbf.check(&test_value)); - // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow UInt32 -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt16, ParquetType::INT32); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_uint64_no_coercion() { - // UInt64 -> Parquet INT64 (reinterpret cast preserves bit pattern) + fn test_check_uint64() { let test_value = 10_000_000_000_000_000_000_u64; // > i64::MAX let array = Arc::new(UInt64Array::from(vec![ 100_u64, @@ -468,254 +438,113 @@ mod tests { 15_000_000_000_000_000_000_u64, ])); let field = Field::new("col", ArrowType::UInt64, false); - let (sbbf, parquet_type) = build_sbbf(array, field.clone()); + let sbbf = build_sbbf(array, field.clone()); - // Direct check should succeed (bit pattern preserved) - assert!(sbbf.check(&test_value), "Direct check should succeed"); + // No coercion necessary + assert!(sbbf.check(&test_value)); - // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow UInt64 -> Parquet INT64 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt64, ParquetType::INT64); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_decimal128_small_coercion() { - // Decimal128(5, 2) -> Parquet INT32 (precision 1-9, truncation changes bytes) + fn test_check_decimal128_small() { let test_value = 20075_i128; let array = Decimal128Array::from(vec![10050_i128, test_value, 30099_i128]) .with_precision_and_scale(5, 2) .unwrap(); let field = Field::new("col", ArrowType::Decimal128(5, 2), false); - let schema = Arc::new(Schema::new(vec![field.clone()])); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); - - // Write with bloom filter - let mut file = tempfile().unwrap(); - let props = WriterProperties::builder() - .set_bloom_filter_enabled(true) - .build(); - let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); - - // Read back - let options = ReadOptionsBuilder::new() - .with_reader_properties( - ReaderProperties::builder() - .set_read_bloom_filter(true) - .build(), - ) - .build(); - let reader = SerializedFileReader::new_with_options(file, options).unwrap(); - let row_group_reader = reader.get_row_group(0).unwrap(); - let metadata = row_group_reader.metadata(); - let column_chunk = metadata.column(0); - let parquet_type = column_chunk.column_type(); - let bloom_filter = row_group_reader - .get_column_bloom_filter(0) - .expect("Bloom filter should exist"); + let sbbf = build_sbbf(Arc::new(array), field.clone()); - // Test 1: Direct check with i128 bytes should fail (bloom filter has i32) + // Check without coercion fails let test_bytes = test_value.to_le_bytes(); - let direct_result = bloom_filter.check(&test_bytes[..]); - println!( - "Decimal128(5,2): Direct Sbbf check with i128 bytes = {}", - direct_result - ); - assert!( - !direct_result, - "Direct check with i128 bytes should fail (bloom filter has i32)" - ); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(!direct_result); - // Test 2: ArrowSbbf check should succeed (coerces i128 to i32) - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); + // Arrow Decimal128(5, 2) -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal128(5, 2), ParquetType::INT32); let arrow_result = arrow_sbbf.check(&test_bytes[..]); - println!( - "Decimal128(5,2): ArrowSbbf check with i128 bytes = {}", - arrow_result - ); - assert!( - arrow_result, - "ArrowSbbf check should succeed (coerces to i32)" - ); + assert!(arrow_result); } #[test] - fn test_decimal128_medium_coercion() { - // Decimal128(15, 2) -> Parquet INT64 (precision 10-18, truncation changes bytes) - // Direct Sbbf check: false (bug - checking i128 but filter has i64) - // ArrowSbbf check: true (fix - coerces i128 to i64) + fn test_check_decimal128_medium() { let test_value = 9876543210987_i128; let array = Decimal128Array::from(vec![1234567890123_i128, test_value, 5555555555555_i128]) .with_precision_and_scale(15, 2) .unwrap(); let field = Field::new("col", ArrowType::Decimal128(15, 2), false); - let schema = Arc::new(Schema::new(vec![field.clone()])); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); - - // Write with bloom filter - let mut file = tempfile().unwrap(); - let props = WriterProperties::builder() - .set_bloom_filter_enabled(true) - .build(); - let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); + let sbbf = build_sbbf(Arc::new(array), field.clone()); - // Read back - let options = ReadOptionsBuilder::new() - .with_reader_properties( - ReaderProperties::builder() - .set_read_bloom_filter(true) - .build(), - ) - .build(); - let reader = SerializedFileReader::new_with_options(file, options).unwrap(); - let row_group_reader = reader.get_row_group(0).unwrap(); - let metadata = row_group_reader.metadata(); - let column_chunk = metadata.column(0); - let parquet_type = column_chunk.column_type(); - let bloom_filter = row_group_reader - .get_column_bloom_filter(0) - .expect("Bloom filter should exist"); - - // Test 1: Direct check with i128 bytes should fail (bloom filter has i64) + // Check without coercion fails let test_bytes = test_value.to_le_bytes(); - let direct_result = bloom_filter.check(&test_bytes[..]); - println!( - "Decimal128(15,2): Direct Sbbf check with i128 bytes = {}", - direct_result - ); - assert!( - !direct_result, - "Direct check with i128 bytes should fail (bloom filter has i64)" - ); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(!direct_result); - // Test 2: ArrowSbbf check should succeed (coerces i128 to i64) - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); + // Arrow Decimal128(15, 2) -> Parquet INT64 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal128(15, 2), ParquetType::INT64); let arrow_result = arrow_sbbf.check(&test_bytes[..]); - println!( - "Decimal128(15,2): ArrowSbbf check with i128 bytes = {}", - arrow_result - ); - assert!( - arrow_result, - "ArrowSbbf check should succeed (coerces to i64)" - ); + assert!(arrow_result); } #[test] - fn test_decimal32_no_coercion() { - // Decimal32(5, 2) -> Parquet INT32 (reinterpret cast preserves bit pattern for small precision) - // Direct Sbbf check: true (works without ArrowSbbf for Decimal32) - // ArrowSbbf check: true (wrapper doesn't hurt) - // Note: Decimal32 stores i32 natively, so no truncation occurs + fn test_check_decimal32() { let test_value = 20075_i32; let array = Decimal32Array::from(vec![10050_i32, test_value, 30099_i32]) .with_precision_and_scale(5, 2) .unwrap(); let field = Field::new("col", ArrowType::Decimal32(5, 2), false); - let (sbbf, parquet_type) = build_sbbf(Arc::new(array), field.clone()); + let sbbf = build_sbbf(Arc::new(array), field.clone()); - // Direct check should succeed (bit pattern preserved) - assert!(sbbf.check(&test_value), "Direct check should succeed"); + // No coercion necessary + assert!(sbbf.check(&test_value)); - // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow Decimal32(5, 2) -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal32(5, 2), ParquetType::INT32); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_float16_no_coercion() { - // Float16 -> Parquet FIXED_LEN_BYTE_ARRAY(2) (stored as-is) - // Direct Sbbf check: true (bit pattern preserved) - // ArrowSbbf check: true (wrapper doesn't hurt) + fn test_check_float16() { use half::f16; let test_value = f16::from_f32(2.5); let array = Float16Array::from(vec![f16::from_f32(1.5), test_value, f16::from_f32(3.5)]); let field = Field::new("col", ArrowType::Float16, false); - let schema = Arc::new(Schema::new(vec![field.clone()])); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let sbbf = build_sbbf(Arc::new(array), field.clone()); - // Write with bloom filter - let mut file = tempfile().unwrap(); - let props = WriterProperties::builder() - .set_bloom_filter_enabled(true) - .build(); - let mut writer = ArrowWriter::try_new(&mut file, schema, Some(props)).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); - - // Read back - let options = ReadOptionsBuilder::new() - .with_reader_properties( - ReaderProperties::builder() - .set_read_bloom_filter(true) - .build(), - ) - .build(); - let reader = SerializedFileReader::new_with_options(file, options).unwrap(); - let row_group_reader = reader.get_row_group(0).unwrap(); - let metadata = row_group_reader.metadata(); - let column_chunk = metadata.column(0); - let parquet_type = column_chunk.column_type(); - let bloom_filter = row_group_reader - .get_column_bloom_filter(0) - .expect("Bloom filter should exist"); - - // Test 1: Direct check with f16 bytes should work (bit pattern preserved) + // No coercion necessary let test_bytes = test_value.to_le_bytes(); - let direct_result = bloom_filter.check(&test_bytes[..]); - println!("Float16: Direct Sbbf check with bytes = {}", direct_result); - assert!(direct_result, "Direct check with bytes should succeed"); - - // Test 2: ArrowSbbf check should also work - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(direct_result); + + // Arrow Float16 -> Parquet FIXED_LEN_BYTE_ARRAY + let arrow_sbbf = ArrowSbbf::new( + &sbbf, + &ArrowType::Float16, + ParquetType::FIXED_LEN_BYTE_ARRAY, + ); let arrow_result = arrow_sbbf.check(&test_bytes[..]); - println!("Float16: ArrowSbbf check with bytes = {}", arrow_result); - assert!(arrow_result, "ArrowSbbf check should succeed"); + assert!(arrow_result); } #[test] - fn test_date64_no_coercion_by_default() { - // Date64 -> Parquet INT64 (stored as-is when coerce_types=false, which is default) - // Direct Sbbf check: true (no coercion by default) - // ArrowSbbf check: true (wrapper doesn't hurt) - // Note: Date64 CAN be coerced to Date32/INT32 with coerce_types=true, - // but that's not the default behavior, so bloom filters work correctly by default + fn test_check_date64_default() { let test_value = 2_000_000_000_i64; let array = Date64Array::from(vec![1_000_000_000_i64, test_value, 3_000_000_000_i64]); let field = Field::new("col", ArrowType::Date64, false); - let (sbbf, parquet_type) = build_sbbf(Arc::new(array), field.clone()); + let sbbf = build_sbbf(Arc::new(array), field.clone()); - // Direct check should succeed (no coercion by default) - assert!(sbbf.check(&test_value), "Direct check should succeed"); + // No coercion necessary + assert!(sbbf.check(&test_value)); - // ArrowSbbf check should also succeed - let arrow_sbbf = ArrowSbbf::new(&sbbf, field.data_type(), parquet_type); - assert!( - arrow_sbbf.check(&test_value), - "ArrowSbbf check should succeed" - ); + // Arrow Date64 -> Parquet INT64 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Date64, ParquetType::INT64); + assert!(arrow_sbbf.check(&test_value)); } #[test] - fn test_date64_with_coerce_types() { - // This test verifies that ArrowSbbf correctly handles Date64 when coerce_types=true. - // - // When WriterProperties::set_coerce_types(true) is used: - // - Date64 (i64 milliseconds) -> Date32 (i32 days) -> Parquet INT32 - // - Conversion: milliseconds / 86_400_000 = days - // - // ArrowSbbf detects this by inspecting the physical type (INT32 vs INT64) - // and automatically performs the milliseconds-to-days conversion. - + fn test_check_date64_with_coerce_types() { const MS_PER_DAY: i64 = 86_400_000; let test_value_ms = 10 * MS_PER_DAY; // 10 days in milliseconds let array = Date64Array::from(vec![5 * MS_PER_DAY, test_value_ms, 15 * MS_PER_DAY]); @@ -724,39 +553,17 @@ mod tests { // Write with coerce_types enabled let props = WriterProperties::builder() .set_bloom_filter_enabled(true) - .set_coerce_types(true) // This causes Date64 -> Date32 (INT32) + .set_coerce_types(true) // causes coercion from Arrow Date64 -> Parquet INT32 .build(); - let (bloom_filter, parquet_type) = - build_sbbf_with_props(Arc::new(array), field.clone(), props); - - // Verify physical type is INT32 (not INT64) - assert_eq!( - parquet_type, - ParquetType::INT32, - "Date64 with coerce_types=true should be stored as INT32" - ); + let sbbf = build_sbbf_with_props(Arc::new(array), field.clone(), props); - // Test 1: Direct check with i64 milliseconds fails (bloom filter has i32 days) - let direct_result = bloom_filter.check(&test_value_ms); - assert!( - !direct_result, - "Direct check with i64 ms should fail (bloom filter has i32 days)" - ); + // Check without coercion fails + let direct_result = sbbf.check(&test_value_ms); + assert!(!direct_result); - // Test 2: ArrowSbbf succeeds! It detects INT32 physical type and converts ms->days - let arrow_sbbf = ArrowSbbf::new(&bloom_filter, field.data_type(), parquet_type); + // Arrow Date64 -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Date64, ParquetType::INT32); let arrow_result = arrow_sbbf.check(&test_value_ms); - assert!( - arrow_result, - "ArrowSbbf should handle Date64 coercion automatically" - ); - - // Test 3: Manual conversion also works - let days = (test_value_ms / MS_PER_DAY) as i32; - let manual_result = bloom_filter.check(&days); - assert!( - manual_result, - "Manual conversion to i32 days should succeed" - ); + assert!(arrow_result); } } From dcd61074ab160dea2beecd0e3d465bbad2e83405 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 3 Oct 2025 12:46:35 -0700 Subject: [PATCH 05/17] feat(parquet): arrow sbbf date type handling --- parquet/benches/bloom_filter.rs | 34 +++---- parquet/src/arrow/bloom_filter.rs | 162 +++++++++++++----------------- 2 files changed, 85 insertions(+), 111 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index 6eb86c7007a2..21998a03c9f7 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -20,7 +20,6 @@ use arrow_schema::{DataType, Field, Schema}; use criterion::*; use parquet::arrow::arrow_writer::ArrowWriter; use parquet::arrow::bloom_filter::ArrowSbbf; -use parquet::basic::Type as ParquetType; use parquet::bloom_filter::Sbbf; use parquet::file::properties::{ReaderProperties, WriterProperties}; use parquet::file::reader::{FileReader, SerializedFileReader}; @@ -32,8 +31,8 @@ use tempfile::tempfile; /// Helper function to create an Sbbf from an array for benchmarking /// /// Writes the given array to a Parquet file with bloom filters enabled, -/// then reads it back and returns the bloom filter and physical type for the first column. -fn setup_sbbf(array: ArrayRef, field: Field) -> (Sbbf, ParquetType) { +/// then reads it back and returns the bloom filter for the first column. +fn setup_sbbf(array: ArrayRef, field: Field) -> Sbbf { let schema = Arc::new(Schema::new(vec![field.clone()])); let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap(); @@ -56,32 +55,27 @@ fn setup_sbbf(array: ArrayRef, field: Field) -> (Sbbf, ParquetType) { .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let row_group_reader = reader.get_row_group(0).unwrap(); - let metadata = row_group_reader.metadata(); - let column_chunk = metadata.column(0); - let parquet_type = column_chunk.column_type(); - let sbbf = row_group_reader + row_group_reader .get_column_bloom_filter(0) .expect("Bloom filter should exist") - .clone(); - - (sbbf, parquet_type) + .clone() } fn bench_integer_types(c: &mut Criterion) { // Setup for Int8 benchmarks (requires coercion to i32) let int8_array = Arc::new(Int8Array::from(vec![42i8; 1000])) as ArrayRef; let int8_field = Field::new("col", DataType::Int8, false); - let (sbbf_int8, physical_type_int8) = setup_sbbf(int8_array, int8_field); - let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8, physical_type_int8); + let sbbf_int8 = setup_sbbf(int8_array, int8_field); + let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8); let test_val_i32 = 42i32; let test_val_i8 = 42i8; // Setup for Int32 benchmarks (no coercion needed) let int32_array = Arc::new(Int32Array::from(vec![42i32; 1000])) as ArrayRef; let int32_field = Field::new("col", DataType::Int32, false); - let (sbbf_int32, physical_type_int32) = setup_sbbf(int32_array, int32_field); - let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32, physical_type_int32); + let sbbf_int32 = setup_sbbf(int32_array, int32_field); + let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32); // Benchmark 1: Direct Sbbf::check with i32 (baseline) c.bench_function("Sbbf::check i32", |b| { @@ -116,8 +110,8 @@ fn bench_decimal_types(c: &mut Criterion) { .unwrap(), ) as ArrayRef; let dec_small_field = Field::new("col", DataType::Decimal128(5, 2), false); - let (sbbf_dec_small, physical_type_dec_small) = setup_sbbf(dec_small_array, dec_small_field); - let arrow_sbbf_dec_small = ArrowSbbf::new(&sbbf_dec_small, &DataType::Decimal128(5, 2), physical_type_dec_small); + let sbbf_dec_small = setup_sbbf(dec_small_array, dec_small_field); + let arrow_sbbf_dec_small = ArrowSbbf::new(&sbbf_dec_small, &DataType::Decimal128(5, 2)); let test_val_dec_small = 123456i128; // Setup for Decimal128 medium precision (coerces to i64) @@ -127,8 +121,8 @@ fn bench_decimal_types(c: &mut Criterion) { .unwrap(), ) as ArrayRef; let dec_medium_field = Field::new("col", DataType::Decimal128(15, 2), false); - let (sbbf_dec_medium, physical_type_dec_medium) = setup_sbbf(dec_medium_array, dec_medium_field); - let arrow_sbbf_dec_medium = ArrowSbbf::new(&sbbf_dec_medium, &DataType::Decimal128(15, 2), physical_type_dec_medium); + let sbbf_dec_medium = setup_sbbf(dec_medium_array, dec_medium_field); + let arrow_sbbf_dec_medium = ArrowSbbf::new(&sbbf_dec_medium, &DataType::Decimal128(15, 2)); let test_val_dec_medium = 123456789012345i128; // Setup for Decimal128 large precision (no coercion) @@ -138,8 +132,8 @@ fn bench_decimal_types(c: &mut Criterion) { .unwrap(), ) as ArrayRef; let dec_large_field = Field::new("col", DataType::Decimal128(30, 2), false); - let (sbbf_dec_large, physical_type_dec_large) = setup_sbbf(dec_large_array, dec_large_field); - let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2), physical_type_dec_large); + let sbbf_dec_large = setup_sbbf(dec_large_array, dec_large_field); + let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2)); let test_val_dec_large = 123456789012345678901234567890i128; // Benchmark 1: ArrowSbbf::check with Decimal128 small (coerce to i32) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index 7b8a443f3770..1741c30488f8 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -25,19 +25,12 @@ //! Arrow supports types like `Int8` and `Int16` that have no corresponding Parquet type. //! Data of these types must be coerced to the appropriate Parquet type when writing. //! -//! For example, Arrow small integer types are coerced writing to Parquet: +//! For example, Arrow small integer types are coerced when writing to Parquet: //! - `Int8` → `INT32` (i8 → i32) //! - `Int16` → `INT32` (i16 → i32) //! - `UInt8` → `INT32` (u8 → u32 → i32) //! - `UInt16` → `INT32` (u16 → u32 → i32) //! -//! Decimal types with small precision are also coerced: -//! - `Decimal32/64/128/256(precision 1-9)` → `INT32` (truncated via `as i32`) -//! - `Decimal64/128/256(precision 10-18)` → `INT64` (truncated via `as i64`) -//! -//! In these situations, bloom filters store hashes of the *physical* Parquet values, but Arrow users -//! will often need to check using the *logical* Arrow values. -//! //! [`ArrowSbbf`] wraps an [`Sbbf`] and an Arrow [`DataType`], automatically coercing //! values to their Parquet representation before checking the bloom filter. //! @@ -47,13 +40,11 @@ //! use arrow_schema::DataType; //! use parquet::arrow::bloom_filter::ArrowSbbf; //! -//! // Get bloom filter and metadata from row group reader -//! let column_chunk = row_group_reader.metadata().column(0); +//! // Get bloom filter from row group reader //! let sbbf = row_group_reader.get_column_bloom_filter(0)?; -//! let parquet_type = column_chunk.column_type(); //! -//! // Create ArrowSbbf with both logical and physical types -//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int8, parquet_type); +//! // Create ArrowSbbf with the logical Arrow type +//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Int8); //! //! // Check with i8 value - automatically coerced to i32 //! let value: i8 = 42; @@ -62,31 +53,59 @@ //! } //! ``` //! -//! # Date64 Handling +//! # Date64 with Type Coercion +//! +//! `Date64` requires special handling because it can be stored as either INT32 or INT64 +//! depending on [`WriterProperties::set_coerce_types`]. //! -//! [`ArrowSbbf`] correctly handles `Date64` values regardless of whether -//! [`WriterProperties::set_coerce_types`] was used: +//! When `coerce_types=true`, Date64 is stored as Date32 (INT32, days since epoch). +//! Users must convert their Date64 values before checking: +//! +//! ```ignore +//! use arrow_array::temporal_conversions::MILLISECONDS_IN_DAY; +//! use arrow_array::Date64Array; +//! use arrow_cast::cast; +//! use arrow_schema::DataType; //! -//! - When stored as INT64 (default): Values checked as-is (milliseconds since epoch) -//! - When stored as INT32 (coerce_types=true): Values automatically converted from milliseconds to days +//! let date64_value = 864_000_000_i64; // milliseconds //! -//! The physical type determines the correct conversion automatically. +//! // Check the column's physical type to determine conversion +//! let column_chunk = row_group_reader.metadata().column(0); +//! match column_chunk.column_type() { +//! ParquetType::INT32 => { +//! // Date64 was coerced to Date32 - convert milliseconds to days +//! let date32_value = (date64_value / MILLISECONDS_IN_DAY) as i32; +//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Date32); +//! arrow_sbbf.check(&date32_value) +//! } +//! ParquetType::INT64 => { +//! // Date64 stored as-is - check directly +//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Date64); +//! arrow_sbbf.check(&date64_value) +//! } +//! _ => unreachable!() +//! } +//! ``` +//! +//! Alternatively, use [`arrow_cast::cast`] for the conversion: +//! ```ignore +//! let date64_array = Date64Array::from(vec![date64_value]); +//! let date32_array = cast(&date64_array, &DataType::Date32)?; +//! let date32_value = date32_array.as_primitive::().value(0); +//! ``` //! //! [`WriterProperties::set_coerce_types`]: crate::file::properties::WriterPropertiesBuilder::set_coerce_types +//! [`arrow_cast::cast`]: https://docs.rs/arrow-cast/latest/arrow_cast/cast/fn.cast.html -use crate::basic::Type as ParquetType; use crate::bloom_filter::Sbbf; use crate::data_type::AsBytes; use arrow_schema::DataType as ArrowType; /// Wraps an [`Sbbf`] and provides automatic type coercion based on Arrow schema. -/// Ensures that checking bloom filters works correctly for Arrow types that require -/// coercion to Parquet physical types (e.g., Int8 → INT32). #[derive(Debug, Clone)] pub struct ArrowSbbf<'a> { sbbf: &'a Sbbf, arrow_type: &'a ArrowType, - parquet_type: ParquetType, } impl<'a> ArrowSbbf<'a> { @@ -94,18 +113,9 @@ impl<'a> ArrowSbbf<'a> { /// /// # Arguments /// * `sbbf` - Parquet bloom filter for the column - /// * `arrow_type` - Arrow data type for the column (logical type) - /// * `parquet_type` - Parquet physical type for the column - /// - /// The Parquet type can be obtained from [`ColumnChunkMetaData::column_type()`]. - /// - /// [`ColumnChunkMetaData::column_type()`]: crate::file::metadata::ColumnChunkMetaData::column_type - pub fn new(sbbf: &'a Sbbf, arrow_type: &'a ArrowType, parquet_type: ParquetType) -> Self { - Self { - sbbf, - arrow_type, - parquet_type, - } + /// * `arrow_type` - Arrow data type for the column + pub fn new(sbbf: &'a Sbbf, arrow_type: &'a ArrowType) -> Self { + Self { sbbf, arrow_type } } /// Check if a value might be present in the bloom filter @@ -254,36 +264,6 @@ impl<'a> ArrowSbbf<'a> { _ => self.sbbf.check(value), } } - ArrowType::Date64 => { - // Date64 can be stored as either INT32 or INT64 depending on coerce_types - let bytes = value.as_bytes(); - if bytes.len() == 8 { - let i64_val = i64::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], - ]); - match self.parquet_type { - ParquetType::INT32 => { - // Date64 was coerced to Date32 (days since epoch) - // Convert milliseconds to days: ms / 86_400_000 - const MS_PER_DAY: i64 = 86_400_000; - let days = (i64_val / MS_PER_DAY) as i32; - self.sbbf.check(&days) - } - ParquetType::INT64 => { - // Date64 stored as-is (milliseconds since epoch) - self.sbbf.check(&i64_val) - } - _ => { - // Unexpected physical type for Date64, fall back to direct check - self.sbbf.check(value) - } - } - } else { - // Unexpected size, fall back to direct check - self.sbbf.check(value) - } - } // No coercion needed _ => self.sbbf.check(value), } @@ -358,8 +338,8 @@ mod tests { // Check without coercion fails assert!(!sbbf.check(&test_value)); - // Arrow Int8 -> ParquetType::INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int8, ParquetType::INT32); + // Arrow Int8 -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int8); assert!(arrow_sbbf.check(&test_value)); } @@ -374,7 +354,7 @@ mod tests { assert!(!sbbf.check(&test_value)); // Arrow Int16 -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int16, ParquetType::INT32); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Int16); assert!(arrow_sbbf.check(&test_value)); } @@ -389,7 +369,7 @@ mod tests { assert!(!sbbf.check(&test_value)); // Arrow UInt8 -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt8, ParquetType::INT32); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt8); assert!(arrow_sbbf.check(&test_value)); } @@ -406,7 +386,7 @@ mod tests { assert!(!sbbf.check(&test_value)); // Arrow UInt16 -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt16, ParquetType::INT32); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt16); assert!(arrow_sbbf.check(&test_value)); } @@ -425,7 +405,7 @@ mod tests { assert!(sbbf.check(&test_value)); // Arrow UInt32 -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt16, ParquetType::INT32); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt32); assert!(arrow_sbbf.check(&test_value)); } @@ -444,7 +424,7 @@ mod tests { assert!(sbbf.check(&test_value)); // Arrow UInt64 -> Parquet INT64 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt64, ParquetType::INT64); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::UInt64); assert!(arrow_sbbf.check(&test_value)); } @@ -463,7 +443,7 @@ mod tests { assert!(!direct_result); // Arrow Decimal128(5, 2) -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal128(5, 2), ParquetType::INT32); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal128(5, 2)); let arrow_result = arrow_sbbf.check(&test_bytes[..]); assert!(arrow_result); } @@ -483,7 +463,7 @@ mod tests { assert!(!direct_result); // Arrow Decimal128(15, 2) -> Parquet INT64 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal128(15, 2), ParquetType::INT64); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal128(15, 2)); let arrow_result = arrow_sbbf.check(&test_bytes[..]); assert!(arrow_result); } @@ -501,7 +481,7 @@ mod tests { assert!(sbbf.check(&test_value)); // Arrow Decimal32(5, 2) -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal32(5, 2), ParquetType::INT32); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal32(5, 2)); assert!(arrow_sbbf.check(&test_value)); } @@ -519,11 +499,7 @@ mod tests { assert!(direct_result); // Arrow Float16 -> Parquet FIXED_LEN_BYTE_ARRAY - let arrow_sbbf = ArrowSbbf::new( - &sbbf, - &ArrowType::Float16, - ParquetType::FIXED_LEN_BYTE_ARRAY, - ); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Float16); let arrow_result = arrow_sbbf.check(&test_bytes[..]); assert!(arrow_result); } @@ -539,31 +515,35 @@ mod tests { assert!(sbbf.check(&test_value)); // Arrow Date64 -> Parquet INT64 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Date64, ParquetType::INT64); + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Date64); assert!(arrow_sbbf.check(&test_value)); } #[test] fn test_check_date64_with_coerce_types() { - const MS_PER_DAY: i64 = 86_400_000; - let test_value_ms = 10 * MS_PER_DAY; // 10 days in milliseconds - let array = Date64Array::from(vec![5 * MS_PER_DAY, test_value_ms, 15 * MS_PER_DAY]); + use arrow_array::temporal_conversions::MILLISECONDS_IN_DAY; + + let test_value_ms = 10 * MILLISECONDS_IN_DAY; // 10 days in milliseconds + let array = Date64Array::from(vec![ + 5 * MILLISECONDS_IN_DAY, + test_value_ms, + 15 * MILLISECONDS_IN_DAY, + ]); let field = Field::new("col", ArrowType::Date64, false); // Write with coerce_types enabled let props = WriterProperties::builder() .set_bloom_filter_enabled(true) - .set_coerce_types(true) // causes coercion from Arrow Date64 -> Parquet INT32 + .set_coerce_types(true) .build(); let sbbf = build_sbbf_with_props(Arc::new(array), field.clone(), props); // Check without coercion fails - let direct_result = sbbf.check(&test_value_ms); - assert!(!direct_result); + assert!(!sbbf.check(&test_value_ms)); - // Arrow Date64 -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Date64, ParquetType::INT32); - let arrow_result = arrow_sbbf.check(&test_value_ms); - assert!(arrow_result); + // Arrow Date64 -> Arrow Date32 -> Parquet INT32 + let date32_value = (test_value_ms / MILLISECONDS_IN_DAY) as i32; + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Date32); // Note: Date32, not Date64 + assert!(arrow_sbbf.check(&date32_value)); } } From 7e5cdc3652b0790fb016db933f1e7f86e619b0f4 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 3 Oct 2025 14:18:14 -0700 Subject: [PATCH 06/17] fix(parquet): arrow sbbf malformed input handling --- parquet/src/arrow/bloom_filter.rs | 276 ++++++++++++++++++++---------- 1 file changed, 184 insertions(+), 92 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index 1741c30488f8..7fa0fd5fed98 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -128,143 +128,235 @@ impl<'a> ArrowSbbf<'a> { ArrowType::Int8 => { // Arrow Int8 -> Parquet INT32 let bytes = value.as_bytes(); + + debug_assert!( + bytes.len() == 1, + "ArrowSbbf: expected 1 byte for Int8, got {}. This indicates a type mismatch.", + bytes.len() + ); + if bytes.len() == 1 { let i8_val = i8::from_le_bytes([bytes[0]]); let i32_val = i8_val as i32; self.sbbf.check(&i32_val) } else { - // Unexpected size, fall back to direct check - self.sbbf.check(value) + true // Unexpected byte length, return false positive } } ArrowType::Int16 => { // Arrow Int16 -> Parquet INT32 let bytes = value.as_bytes(); + + debug_assert!( + bytes.len() == 2, + "ArrowSbbf: expected 2 bytes for Int16, got {}. This indicates a type mismatch.", + bytes.len() + ); + if bytes.len() == 2 { let i16_val = i16::from_le_bytes([bytes[0], bytes[1]]); let i32_val = i16_val as i32; self.sbbf.check(&i32_val) } else { - // Unexpected size, fall back to direct check - self.sbbf.check(value) + true // Unexpected byte length, return false positive } } ArrowType::UInt8 => { // Arrow UInt8 -> Parquet INT32 let bytes = value.as_bytes(); + + debug_assert!( + bytes.len() == 1, + "ArrowSbbf: expected 1 byte for UInt8, got {}. This indicates a type mismatch.", + bytes.len() + ); + if bytes.len() == 1 { let u8_val = bytes[0]; let u32_val = u8_val as u32; let i32_val = u32_val as i32; self.sbbf.check(&i32_val) } else { - // Unexpected size, fall back to direct check - self.sbbf.check(value) + true // Unexpected byte length, return false positive } } ArrowType::UInt16 => { // Arrow UInt16 -> Parquet INT32 let bytes = value.as_bytes(); + + debug_assert!( + bytes.len() == 2, + "ArrowSbbf: expected 2 bytes for UInt16, got {}. This indicates a type mismatch.", + bytes.len() + ); + if bytes.len() == 2 { let u16_val = u16::from_le_bytes([bytes[0], bytes[1]]); let u32_val = u16_val as u32; let i32_val = u32_val as i32; self.sbbf.check(&i32_val) } else { - // Unexpected size, fall back to direct check - self.sbbf.check(value) + true // Unexpected byte length, return false positive + } + } + ArrowType::Decimal32(precision, _) if *precision >= 1 && *precision <= 9 => { + // Arrow Decimal32 (precision 1-9) -> Parquet INT32 + let bytes = value.as_bytes(); + + debug_assert_eq!( + bytes.len(), + 4, + "ArrowSbbf: expected 4 bytes for Decimal32, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 4 { + let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); + self.sbbf.check(&i32_val) + } else { + true // Unexpected byte length, return false positive + } + } + ArrowType::Decimal64(precision, _) if *precision >= 1 && *precision <= 9 => { + // Arrow Decimal64 (precision 1-9) -> Parquet INT32 + let bytes = value.as_bytes(); + + debug_assert_eq!( + bytes.len(), + 8, + "ArrowSbbf: expected 8 bytes for Decimal64, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 8 { + let i64_val = i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], + ]); + let i32_val = i64_val as i32; + self.sbbf.check(&i32_val) + } else { + true // Unexpected byte length, return false positive } } - ArrowType::Decimal32(precision, _) - | ArrowType::Decimal64(precision, _) - | ArrowType::Decimal128(precision, _) - | ArrowType::Decimal256(precision, _) - if *precision >= 1 && *precision <= 9 => - { - // Decimal with precision 1-9 -> Parquet INT32 - // Writer truncates via `as i32` + ArrowType::Decimal128(precision, _) if *precision >= 1 && *precision <= 9 => { + // Arrow Decimal128 (precision 1-9) -> Parquet INT32 let bytes = value.as_bytes(); - match bytes.len() { - 4 => { - // Decimal32: i32 value, directly reinterpret - let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); - self.sbbf.check(&i32_val) - } - 8 => { - // Decimal64: i64 value, truncate to i32 - let i64_val = i64::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], - ]); - let i32_val = i64_val as i32; - self.sbbf.check(&i32_val) - } - 16 => { - // Decimal128: i128 value, truncate to i32 - let i128_val = i128::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], - bytes[13], bytes[14], bytes[15], - ]); - let i32_val = i128_val as i32; - self.sbbf.check(&i32_val) - } - 32 => { - // Decimal256: i256 stored as 32 bytes, truncate to i32 - // Read first 16 bytes as i128, then truncate - let i128_val = i128::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], - bytes[13], bytes[14], bytes[15], - ]); - let i32_val = i128_val as i32; - self.sbbf.check(&i32_val) - } - _ => self.sbbf.check(value), + + debug_assert_eq!( + bytes.len(), + 16, + "ArrowSbbf: expected 16 bytes for Decimal128, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 16 { + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], + bytes[14], bytes[15], + ]); + let i32_val = i128_val as i32; + self.sbbf.check(&i32_val) + } else { + true // Unexpected byte length, return false positive } } - ArrowType::Decimal64(precision, _) - | ArrowType::Decimal128(precision, _) - | ArrowType::Decimal256(precision, _) - if *precision >= 10 && *precision <= 18 => - { - // Decimal with precision 10-18 -> Parquet INT64 - // Writer truncates via `as i64` + ArrowType::Decimal256(precision, _) if *precision >= 1 && *precision <= 9 => { + // Arrow Decimal256 (precision 1-9) -> Parquet INT32 let bytes = value.as_bytes(); - match bytes.len() { - 8 => { - // Decimal64: i64 value, directly use - let i64_val = i64::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], - ]); - self.sbbf.check(&i64_val) - } - 16 => { - // Decimal128: i128 value, truncate to i64 - let i128_val = i128::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], - bytes[13], bytes[14], bytes[15], - ]); - let i64_val = i128_val as i64; - self.sbbf.check(&i64_val) - } - 32 => { - // Decimal256: i256 stored as 32 bytes, truncate to i64 - // Read first 16 bytes as i128, then truncate - let i128_val = i128::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], - bytes[13], bytes[14], bytes[15], - ]); - let i64_val = i128_val as i64; - self.sbbf.check(&i64_val) - } - _ => self.sbbf.check(value), + + debug_assert_eq!( + bytes.len(), + 32, + "ArrowSbbf: expected 32 bytes for Decimal256, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 32 { + // Decimal256: i256 stored as 32 bytes, truncate to i32 + // Read first 16 bytes as i128, then truncate + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], + bytes[14], bytes[15], + ]); + let i32_val = i128_val as i32; + self.sbbf.check(&i32_val) + } else { + true // Unexpected byte length, return false positive + } + } + ArrowType::Decimal64(precision, _) if *precision >= 10 && *precision <= 18 => { + // Arrow Decimal64 (precision 10-18) -> Parquet INT64 + let bytes = value.as_bytes(); + + debug_assert_eq!( + bytes.len(), + 8, + "ArrowSbbf: expected 8 bytes for Decimal64, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 8 { + let i64_val = i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], + ]); + self.sbbf.check(&i64_val) + } else { + true // Unexpected byte length, return false positive + } + } + ArrowType::Decimal128(precision, _) if *precision >= 10 && *precision <= 18 => { + // Arrow Decimal128 (precision 10-18) -> Parquet INT64 + let bytes = value.as_bytes(); + + debug_assert_eq!( + bytes.len(), + 16, + "ArrowSbbf: expected 16 bytes for Decimal128, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 16 { + // Read first 16 bytes as i128, then truncate to i64 + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], + bytes[14], bytes[15], + ]); + let i64_val = i128_val as i64; + self.sbbf.check(&i64_val) + } else { + true // Unexpected byte length, return false positive + } + } + ArrowType::Decimal256(precision, _) if *precision >= 10 && *precision <= 18 => { + // Arrow Decimal256 (precision 10-18) -> Parquet INT64 + let bytes = value.as_bytes(); + + debug_assert_eq!( + bytes.len(), + 32, + "ArrowSbbf: expected 32 bytes for Decimal256, got {}. This indicates a type mismatch.", + bytes.len() + ); + + if bytes.len() == 32 { + // Read first 16 bytes as i128, then truncate to i64 + let i128_val = i128::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], + bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], + bytes[14], bytes[15], + ]); + let i64_val = i128_val as i64; + self.sbbf.check(&i64_val) + } else { + true // Unexpected byte length, return false positive } } - // No coercion needed + // No coercion necessary _ => self.sbbf.check(value), } } From 3ec2a94e24a9e7adb690cc0cbbff97da39ddd9e1 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 3 Oct 2025 17:22:31 -0700 Subject: [PATCH 07/17] test(parquet): arrow sbbf decimal256 coercion --- parquet/src/arrow/bloom_filter.rs | 59 ++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index 7fa0fd5fed98..dd30dd8a0b2b 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -274,8 +274,7 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 32 { - // Decimal256: i256 stored as 32 bytes, truncate to i32 - // Read first 16 bytes as i128, then truncate + // Read first 16 bytes as i128, then truncate to i32 let i128_val = i128::from_le_bytes([ bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], @@ -370,8 +369,8 @@ mod tests { use crate::file::reader::{FileReader, SerializedFileReader}; use crate::file::serialized_reader::ReadOptionsBuilder; use arrow_array::{ - ArrayRef, Date64Array, Decimal128Array, Decimal32Array, Float16Array, Int16Array, - Int8Array, RecordBatch, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Float16Array, + Int16Array, Int8Array, RecordBatch, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{Field, Schema}; use std::sync::Arc; @@ -560,6 +559,58 @@ mod tests { assert!(arrow_result); } + #[test] + fn test_check_decimal256_small() { + use arrow_buffer::i256; + + let test_value = i256::from_i128(20075_i128); + let array = Decimal256Array::from(vec![ + i256::from_i128(10050_i128), + test_value, + i256::from_i128(30099_i128), + ]) + .with_precision_and_scale(5, 2) + .unwrap(); + let field = Field::new("col", ArrowType::Decimal256(5, 2), false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // Check without coercion fails + let test_bytes = test_value.to_le_bytes(); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(!direct_result); + + // Arrow Decimal256(5, 2) -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal256(5, 2)); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + assert!(arrow_result); + } + + #[test] + fn test_check_decimal256_medium() { + use arrow_buffer::i256; + + let test_value = i256::from_i128(9876543210987_i128); + let array = Decimal256Array::from(vec![ + i256::from_i128(1234567890123_i128), + test_value, + i256::from_i128(5555555555555_i128), + ]) + .with_precision_and_scale(15, 2) + .unwrap(); + let field = Field::new("col", ArrowType::Decimal256(15, 2), false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // Check without coercion fails + let test_bytes = test_value.to_le_bytes(); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(!direct_result); + + // Arrow Decimal256(15, 2) -> Parquet INT64 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal256(15, 2)); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + assert!(arrow_result); + } + #[test] fn test_check_decimal32() { let test_value = 20075_i32; From 829dd5294a7b8541d52e7ef864640f457954be49 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 3 Oct 2025 17:35:26 -0700 Subject: [PATCH 08/17] fix(parquet): simplify sbbf benchmark --- parquet/benches/bloom_filter.rs | 36 ++++++++++++++------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index 21998a03c9f7..d8b7a3d2078a 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -63,21 +63,20 @@ fn setup_sbbf(array: ArrayRef, field: Field) -> Sbbf { } fn bench_integer_types(c: &mut Criterion) { - // Setup for Int8 benchmarks (requires coercion to i32) - let int8_array = Arc::new(Int8Array::from(vec![42i8; 1000])) as ArrayRef; + // Setup for Int8 benchmarks + let test_val_i8 = 42i8; + let int8_array = Arc::new(Int8Array::from(vec![test_val_i8; 1000])) as ArrayRef; let int8_field = Field::new("col", DataType::Int8, false); let sbbf_int8 = setup_sbbf(int8_array, int8_field); let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8); - let test_val_i32 = 42i32; - let test_val_i8 = 42i8; - // Setup for Int32 benchmarks (no coercion needed) - let int32_array = Arc::new(Int32Array::from(vec![42i32; 1000])) as ArrayRef; + // Setup for Int32 benchmarks + let test_val_i32 = 42i32; + let int32_array = Arc::new(Int32Array::from(vec![test_val_i32; 1000])) as ArrayRef; let int32_field = Field::new("col", DataType::Int32, false); let sbbf_int32 = setup_sbbf(int32_array, int32_field); let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32); - // Benchmark 1: Direct Sbbf::check with i32 (baseline) c.bench_function("Sbbf::check i32", |b| { b.iter(|| { let result = sbbf_int8.check(&test_val_i32); @@ -85,7 +84,6 @@ fn bench_integer_types(c: &mut Criterion) { }) }); - // Benchmark 2: ArrowSbbf::check with i8 (requires coercion to i32) c.bench_function("ArrowSbbf::check i8 (coerce to i32)", |b| { b.iter(|| { let result = arrow_sbbf_int8.check(&test_val_i8); @@ -93,7 +91,6 @@ fn bench_integer_types(c: &mut Criterion) { }) }); - // Benchmark 3: ArrowSbbf::check with i32 (no coercion, passthrough) c.bench_function("ArrowSbbf::check i32 (no coercion)", |b| { b.iter(|| { let result = arrow_sbbf_int32.check(&test_val_i32); @@ -103,40 +100,39 @@ fn bench_integer_types(c: &mut Criterion) { } fn bench_decimal_types(c: &mut Criterion) { - // Setup for Decimal128 small precision (coerces to i32) + // Setup for Decimal128 small precision + let test_val_dec_small = 123456i128; let dec_small_array = Arc::new( - Decimal128Array::from(vec![123456i128; 1000]) + Decimal128Array::from(vec![test_val_dec_small; 1000]) .with_precision_and_scale(5, 2) .unwrap(), ) as ArrayRef; let dec_small_field = Field::new("col", DataType::Decimal128(5, 2), false); let sbbf_dec_small = setup_sbbf(dec_small_array, dec_small_field); let arrow_sbbf_dec_small = ArrowSbbf::new(&sbbf_dec_small, &DataType::Decimal128(5, 2)); - let test_val_dec_small = 123456i128; - // Setup for Decimal128 medium precision (coerces to i64) + // Setup for Decimal128 medium precision + let test_val_dec_medium = 123456789012345i128; let dec_medium_array = Arc::new( - Decimal128Array::from(vec![123456789012345i128; 1000]) + Decimal128Array::from(vec![test_val_dec_medium; 1000]) .with_precision_and_scale(15, 2) .unwrap(), ) as ArrayRef; let dec_medium_field = Field::new("col", DataType::Decimal128(15, 2), false); let sbbf_dec_medium = setup_sbbf(dec_medium_array, dec_medium_field); let arrow_sbbf_dec_medium = ArrowSbbf::new(&sbbf_dec_medium, &DataType::Decimal128(15, 2)); - let test_val_dec_medium = 123456789012345i128; - // Setup for Decimal128 large precision (no coercion) + // Setup for Decimal128 large precision + let test_val_dec_large = 123456789012345678901234567890i128; let dec_large_array = Arc::new( - Decimal128Array::from(vec![123456789012345678901234567890i128; 1000]) + Decimal128Array::from(vec![test_val_dec_large; 1000]) .with_precision_and_scale(30, 2) .unwrap(), ) as ArrayRef; let dec_large_field = Field::new("col", DataType::Decimal128(30, 2), false); let sbbf_dec_large = setup_sbbf(dec_large_array, dec_large_field); let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2)); - let test_val_dec_large = 123456789012345678901234567890i128; - // Benchmark 1: ArrowSbbf::check with Decimal128 small (coerce to i32) c.bench_function("ArrowSbbf::check Decimal128(5,2) (coerce to i32)", |b| { b.iter(|| { let test_bytes = test_val_dec_small.to_le_bytes(); @@ -145,7 +141,6 @@ fn bench_decimal_types(c: &mut Criterion) { }) }); - // Benchmark 2: ArrowSbbf::check with Decimal128 medium (coerce to i64) c.bench_function("ArrowSbbf::check Decimal128(15,2) (coerce to i64)", |b| { b.iter(|| { let test_bytes = test_val_dec_medium.to_le_bytes(); @@ -154,7 +149,6 @@ fn bench_decimal_types(c: &mut Criterion) { }) }); - // Benchmark 3: ArrowSbbf::check with Decimal128 large (no coercion) c.bench_function("ArrowSbbf::check Decimal128(30,2) (no coercion)", |b| { b.iter(|| { let test_bytes = test_val_dec_large.to_le_bytes(); From 094564c1b4adb7f0e95654717c1ac4172d543254 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 3 Oct 2025 18:47:11 -0700 Subject: [PATCH 09/17] fix(parquet): simplify sbbf decimal coercion --- parquet/src/arrow/bloom_filter.rs | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index dd30dd8a0b2b..2645a21458fb 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -251,12 +251,7 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 16 { - let i128_val = i128::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], - bytes[14], bytes[15], - ]); - let i32_val = i128_val as i32; + let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); self.sbbf.check(&i32_val) } else { true // Unexpected byte length, return false positive @@ -274,13 +269,7 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 32 { - // Read first 16 bytes as i128, then truncate to i32 - let i128_val = i128::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], - bytes[14], bytes[15], - ]); - let i32_val = i128_val as i32; + let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); self.sbbf.check(&i32_val) } else { true // Unexpected byte length, return false positive @@ -319,13 +308,10 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 16 { - // Read first 16 bytes as i128, then truncate to i64 - let i128_val = i128::from_le_bytes([ + let i64_val = i64::from_le_bytes([ bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], - bytes[14], bytes[15], + bytes[7], ]); - let i64_val = i128_val as i64; self.sbbf.check(&i64_val) } else { true // Unexpected byte length, return false positive @@ -343,13 +329,10 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 32 { - // Read first 16 bytes as i128, then truncate to i64 - let i128_val = i128::from_le_bytes([ + let i64_val = i64::from_le_bytes([ bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], - bytes[14], bytes[15], + bytes[7], ]); - let i64_val = i128_val as i64; self.sbbf.check(&i64_val) } else { true // Unexpected byte length, return false positive From daea39071023da0bbf37fceb2aa15c50e4261107 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 3 Oct 2025 18:47:34 -0700 Subject: [PATCH 10/17] fix(parquet): sbbf doc reference --- parquet/src/arrow/bloom_filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index 2645a21458fb..f01e4809291a 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -31,7 +31,7 @@ //! - `UInt8` → `INT32` (u8 → u32 → i32) //! - `UInt16` → `INT32` (u16 → u32 → i32) //! -//! [`ArrowSbbf`] wraps an [`Sbbf`] and an Arrow [`DataType`], automatically coercing +//! [`ArrowSbbf`] wraps an [`Sbbf`] and an Arrow [`arrow_schema::DataType`], automatically coercing //! values to their Parquet representation before checking the bloom filter. //! //! # Example From 0654b40d9fcd31a6a23576c5e09a067eac75838f Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 08:50:13 -0700 Subject: [PATCH 11/17] fix(parquet): sbbf benchmark noise threshold --- parquet/benches/bloom_filter.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index d8b7a3d2078a..5d495696b11d 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -158,6 +158,20 @@ fn bench_decimal_types(c: &mut Criterion) { }); } -criterion_group!(benches_int, bench_integer_types); -criterion_group!(benches_decimal, bench_decimal_types); +fn config() -> Criterion { + Criterion::default().noise_threshold(0.05) +} + +criterion_group! { + name = benches_int; + config = config(); + targets = bench_integer_types +} + +criterion_group! { + name = benches_decimal; + config = config(); + targets = bench_decimal_types +} + criterion_main!(benches_int, benches_decimal); From ebbf94a02693b9575f0438e14ba9efd077644b74 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 09:06:26 -0700 Subject: [PATCH 12/17] feat(parquet): add sbbf check benches --- parquet/benches/bloom_filter.rs | 35 +++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index 5d495696b11d..32f9f15b6f3a 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -77,9 +77,9 @@ fn bench_integer_types(c: &mut Criterion) { let sbbf_int32 = setup_sbbf(int32_array, int32_field); let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32); - c.bench_function("Sbbf::check i32", |b| { + c.bench_function("Sbbf::check i8", |b| { b.iter(|| { - let result = sbbf_int8.check(&test_val_i32); + let result = sbbf_int8.check(&test_val_i8); hint::black_box(result); }) }); @@ -91,6 +91,13 @@ fn bench_integer_types(c: &mut Criterion) { }) }); + c.bench_function("Sbbf::check i32", |b| { + b.iter(|| { + let result = sbbf_int32.check(&test_val_i32); + hint::black_box(result); + }) + }); + c.bench_function("ArrowSbbf::check i32 (no coercion)", |b| { b.iter(|| { let result = arrow_sbbf_int32.check(&test_val_i32); @@ -133,6 +140,14 @@ fn bench_decimal_types(c: &mut Criterion) { let sbbf_dec_large = setup_sbbf(dec_large_array, dec_large_field); let arrow_sbbf_dec_large = ArrowSbbf::new(&sbbf_dec_large, &DataType::Decimal128(30, 2)); + c.bench_function("Sbbf::check Decimal128(5,2)", |b| { + b.iter(|| { + let i32_val = test_val_dec_small as i32; + let result = sbbf_dec_small.check(&i32_val); + hint::black_box(result); + }) + }); + c.bench_function("ArrowSbbf::check Decimal128(5,2) (coerce to i32)", |b| { b.iter(|| { let test_bytes = test_val_dec_small.to_le_bytes(); @@ -141,6 +156,14 @@ fn bench_decimal_types(c: &mut Criterion) { }) }); + c.bench_function("Sbbf::check Decimal128(15,2)", |b| { + b.iter(|| { + let i64_val = test_val_dec_medium as i64; + let result = sbbf_dec_medium.check(&i64_val); + hint::black_box(result); + }) + }); + c.bench_function("ArrowSbbf::check Decimal128(15,2) (coerce to i64)", |b| { b.iter(|| { let test_bytes = test_val_dec_medium.to_le_bytes(); @@ -149,6 +172,14 @@ fn bench_decimal_types(c: &mut Criterion) { }) }); + c.bench_function("Sbbf::check Decimal128(30,2)", |b| { + b.iter(|| { + let test_bytes = test_val_dec_large.to_le_bytes(); + let result = sbbf_dec_large.check(&test_bytes[..]); + hint::black_box(result); + }) + }); + c.bench_function("ArrowSbbf::check Decimal128(30,2) (no coercion)", |b| { b.iter(|| { let test_bytes = test_val_dec_large.to_le_bytes(); From cbf6db215a1abd340c50572cbd83d664da2dc3d0 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 09:41:46 -0700 Subject: [PATCH 13/17] fix(parquet): isolate decimal sbbf benches --- parquet/benches/bloom_filter.rs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index 32f9f15b6f3a..6460b2090a6d 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -109,6 +109,7 @@ fn bench_integer_types(c: &mut Criterion) { fn bench_decimal_types(c: &mut Criterion) { // Setup for Decimal128 small precision let test_val_dec_small = 123456i128; + let test_bytes_dec_small = test_val_dec_small.to_le_bytes(); let dec_small_array = Arc::new( Decimal128Array::from(vec![test_val_dec_small; 1000]) .with_precision_and_scale(5, 2) @@ -120,6 +121,7 @@ fn bench_decimal_types(c: &mut Criterion) { // Setup for Decimal128 medium precision let test_val_dec_medium = 123456789012345i128; + let test_bytes_dec_medium = test_val_dec_medium.to_le_bytes(); let dec_medium_array = Arc::new( Decimal128Array::from(vec![test_val_dec_medium; 1000]) .with_precision_and_scale(15, 2) @@ -131,6 +133,7 @@ fn bench_decimal_types(c: &mut Criterion) { // Setup for Decimal128 large precision let test_val_dec_large = 123456789012345678901234567890i128; + let test_bytes_dec_large = test_val_dec_large.to_le_bytes(); let dec_large_array = Arc::new( Decimal128Array::from(vec![test_val_dec_large; 1000]) .with_precision_and_scale(30, 2) @@ -142,48 +145,42 @@ fn bench_decimal_types(c: &mut Criterion) { c.bench_function("Sbbf::check Decimal128(5,2)", |b| { b.iter(|| { - let i32_val = test_val_dec_small as i32; - let result = sbbf_dec_small.check(&i32_val); + let result = sbbf_dec_small.check(&test_bytes_dec_small[..]); hint::black_box(result); }) }); c.bench_function("ArrowSbbf::check Decimal128(5,2) (coerce to i32)", |b| { b.iter(|| { - let test_bytes = test_val_dec_small.to_le_bytes(); - let result = arrow_sbbf_dec_small.check(&test_bytes[..]); + let result = arrow_sbbf_dec_small.check(&test_bytes_dec_small[..]); hint::black_box(result); }) }); c.bench_function("Sbbf::check Decimal128(15,2)", |b| { b.iter(|| { - let i64_val = test_val_dec_medium as i64; - let result = sbbf_dec_medium.check(&i64_val); + let result = sbbf_dec_medium.check(&test_bytes_dec_medium[..]); hint::black_box(result); }) }); c.bench_function("ArrowSbbf::check Decimal128(15,2) (coerce to i64)", |b| { b.iter(|| { - let test_bytes = test_val_dec_medium.to_le_bytes(); - let result = arrow_sbbf_dec_medium.check(&test_bytes[..]); + let result = arrow_sbbf_dec_medium.check(&test_bytes_dec_medium[..]); hint::black_box(result); }) }); c.bench_function("Sbbf::check Decimal128(30,2)", |b| { b.iter(|| { - let test_bytes = test_val_dec_large.to_le_bytes(); - let result = sbbf_dec_large.check(&test_bytes[..]); + let result = sbbf_dec_large.check(&test_bytes_dec_large[..]); hint::black_box(result); }) }); c.bench_function("ArrowSbbf::check Decimal128(30,2) (no coercion)", |b| { b.iter(|| { - let test_bytes = test_val_dec_large.to_le_bytes(); - let result = arrow_sbbf_dec_large.check(&test_bytes[..]); + let result = arrow_sbbf_dec_large.check(&test_bytes_dec_large[..]); hint::black_box(result); }) }); From 93a74b1e75a6b937de2f5adb57fd80a3d94a3464 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 09:42:34 -0700 Subject: [PATCH 14/17] chore(parquet): reorder arrow sbbf tests --- parquet/src/arrow/bloom_filter.rs | 72 +++++++++++++++---------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index f01e4809291a..cc20dec399c5 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -502,6 +502,42 @@ mod tests { assert!(arrow_sbbf.check(&test_value)); } + #[test] + fn test_check_float16() { + use half::f16; + let test_value = f16::from_f32(2.5); + let array = Float16Array::from(vec![f16::from_f32(1.5), test_value, f16::from_f32(3.5)]); + let field = Field::new("col", ArrowType::Float16, false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // No coercion necessary + let test_bytes = test_value.to_le_bytes(); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(direct_result); + + // Arrow Float16 -> Parquet FIXED_LEN_BYTE_ARRAY + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Float16); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + assert!(arrow_result); + } + + #[test] + fn test_check_decimal32() { + let test_value = 20075_i32; + let array = Decimal32Array::from(vec![10050_i32, test_value, 30099_i32]) + .with_precision_and_scale(5, 2) + .unwrap(); + let field = Field::new("col", ArrowType::Decimal32(5, 2), false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // No coercion necessary + assert!(sbbf.check(&test_value)); + + // Arrow Decimal32(5, 2) -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal32(5, 2)); + assert!(arrow_sbbf.check(&test_value)); + } + #[test] fn test_check_decimal128_small() { let test_value = 20075_i128; @@ -594,42 +630,6 @@ mod tests { assert!(arrow_result); } - #[test] - fn test_check_decimal32() { - let test_value = 20075_i32; - let array = Decimal32Array::from(vec![10050_i32, test_value, 30099_i32]) - .with_precision_and_scale(5, 2) - .unwrap(); - let field = Field::new("col", ArrowType::Decimal32(5, 2), false); - let sbbf = build_sbbf(Arc::new(array), field.clone()); - - // No coercion necessary - assert!(sbbf.check(&test_value)); - - // Arrow Decimal32(5, 2) -> Parquet INT32 - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal32(5, 2)); - assert!(arrow_sbbf.check(&test_value)); - } - - #[test] - fn test_check_float16() { - use half::f16; - let test_value = f16::from_f32(2.5); - let array = Float16Array::from(vec![f16::from_f32(1.5), test_value, f16::from_f32(3.5)]); - let field = Field::new("col", ArrowType::Float16, false); - let sbbf = build_sbbf(Arc::new(array), field.clone()); - - // No coercion necessary - let test_bytes = test_value.to_le_bytes(); - let direct_result = sbbf.check(&test_bytes[..]); - assert!(direct_result); - - // Arrow Float16 -> Parquet FIXED_LEN_BYTE_ARRAY - let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Float16); - let arrow_result = arrow_sbbf.check(&test_bytes[..]); - assert!(arrow_result); - } - #[test] fn test_check_date64_default() { let test_value = 2_000_000_000_i64; From 981eaaaa2f116277bb193ec127ca0d691c65b96f Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 14:12:16 -0700 Subject: [PATCH 15/17] fix(parquet): remove unnecessary arrow sbbf coercion adds some tests --- parquet/src/arrow/bloom_filter.rs | 81 +++++++++++++++---------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index cc20dec399c5..dad54475afee 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -199,24 +199,6 @@ impl<'a> ArrowSbbf<'a> { true // Unexpected byte length, return false positive } } - ArrowType::Decimal32(precision, _) if *precision >= 1 && *precision <= 9 => { - // Arrow Decimal32 (precision 1-9) -> Parquet INT32 - let bytes = value.as_bytes(); - - debug_assert_eq!( - bytes.len(), - 4, - "ArrowSbbf: expected 4 bytes for Decimal32, got {}. This indicates a type mismatch.", - bytes.len() - ); - - if bytes.len() == 4 { - let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); - self.sbbf.check(&i32_val) - } else { - true // Unexpected byte length, return false positive - } - } ArrowType::Decimal64(precision, _) if *precision >= 1 && *precision <= 9 => { // Arrow Decimal64 (precision 1-9) -> Parquet INT32 let bytes = value.as_bytes(); @@ -275,27 +257,6 @@ impl<'a> ArrowSbbf<'a> { true // Unexpected byte length, return false positive } } - ArrowType::Decimal64(precision, _) if *precision >= 10 && *precision <= 18 => { - // Arrow Decimal64 (precision 10-18) -> Parquet INT64 - let bytes = value.as_bytes(); - - debug_assert_eq!( - bytes.len(), - 8, - "ArrowSbbf: expected 8 bytes for Decimal64, got {}. This indicates a type mismatch.", - bytes.len() - ); - - if bytes.len() == 8 { - let i64_val = i64::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], - ]); - self.sbbf.check(&i64_val) - } else { - true // Unexpected byte length, return false positive - } - } ArrowType::Decimal128(precision, _) if *precision >= 10 && *precision <= 18 => { // Arrow Decimal128 (precision 10-18) -> Parquet INT64 let bytes = value.as_bytes(); @@ -352,8 +313,9 @@ mod tests { use crate::file::reader::{FileReader, SerializedFileReader}; use crate::file::serialized_reader::ReadOptionsBuilder; use arrow_array::{ - ArrayRef, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Float16Array, - Int16Array, Int8Array, RecordBatch, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, + Float16Array, Int16Array, Int8Array, RecordBatch, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; use arrow_schema::{Field, Schema}; use std::sync::Arc; @@ -538,6 +500,43 @@ mod tests { assert!(arrow_sbbf.check(&test_value)); } + #[test] + fn test_check_decimal64_small() { + let test_value = 20075_i64; + let array = Decimal64Array::from(vec![10050_i64, test_value, 30099_i64]) + .with_precision_and_scale(5, 2) + .unwrap(); + let field = Field::new("col", ArrowType::Decimal64(5, 2), false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // Check without coercion fails + let test_bytes = test_value.to_le_bytes(); + let direct_result = sbbf.check(&test_bytes[..]); + assert!(!direct_result); + + // Arrow Decimal64(5, 2) -> Parquet INT32 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal64(5, 2)); + let arrow_result = arrow_sbbf.check(&test_bytes[..]); + assert!(arrow_result); + } + + #[test] + fn test_check_decimal64_medium() { + let test_value = 9876543210987_i64; + let array = Decimal64Array::from(vec![1234567890123_i64, test_value, 5555555555555_i64]) + .with_precision_and_scale(15, 2) + .unwrap(); + let field = Field::new("col", ArrowType::Decimal64(15, 2), false); + let sbbf = build_sbbf(Arc::new(array), field.clone()); + + // No coercion necessary + assert!(sbbf.check(&test_value)); + + // Arrow Decimal64(15, 2) -> Parquet INT64 + let arrow_sbbf = ArrowSbbf::new(&sbbf, &ArrowType::Decimal64(15, 2)); + assert!(arrow_sbbf.check(&test_value)); + } + #[test] fn test_check_decimal128_small() { let test_value = 20075_i128; From 3b70bf860184fb91189be16e2ca6f978cc98b9b2 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 14:34:20 -0700 Subject: [PATCH 16/17] fix(parquet): simplify arrow sbbf coercion logic --- parquet/src/arrow/bloom_filter.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/bloom_filter.rs b/parquet/src/arrow/bloom_filter.rs index dad54475afee..0da019d78f4c 100644 --- a/parquet/src/arrow/bloom_filter.rs +++ b/parquet/src/arrow/bloom_filter.rs @@ -172,9 +172,7 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 1 { - let u8_val = bytes[0]; - let u32_val = u8_val as u32; - let i32_val = u32_val as i32; + let i32_val = bytes[0] as i32; self.sbbf.check(&i32_val) } else { true // Unexpected byte length, return false positive @@ -192,8 +190,7 @@ impl<'a> ArrowSbbf<'a> { if bytes.len() == 2 { let u16_val = u16::from_le_bytes([bytes[0], bytes[1]]); - let u32_val = u16_val as u32; - let i32_val = u32_val as i32; + let i32_val = u16_val as i32; self.sbbf.check(&i32_val) } else { true // Unexpected byte length, return false positive @@ -211,11 +208,7 @@ impl<'a> ArrowSbbf<'a> { ); if bytes.len() == 8 { - let i64_val = i64::from_le_bytes([ - bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], - bytes[7], - ]); - let i32_val = i64_val as i32; + let i32_val = i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); self.sbbf.check(&i32_val) } else { true // Unexpected byte length, return false positive From 2aae14e9da95ced134ea29cde4b595657236cd0c Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Sat, 4 Oct 2025 14:42:27 -0700 Subject: [PATCH 17/17] chore(parquet): reduce sbbf bench array size --- parquet/benches/bloom_filter.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs index 6460b2090a6d..e732615400ca 100644 --- a/parquet/benches/bloom_filter.rs +++ b/parquet/benches/bloom_filter.rs @@ -65,14 +65,14 @@ fn setup_sbbf(array: ArrayRef, field: Field) -> Sbbf { fn bench_integer_types(c: &mut Criterion) { // Setup for Int8 benchmarks let test_val_i8 = 42i8; - let int8_array = Arc::new(Int8Array::from(vec![test_val_i8; 1000])) as ArrayRef; + let int8_array = Arc::new(Int8Array::from(vec![test_val_i8; 1])) as ArrayRef; let int8_field = Field::new("col", DataType::Int8, false); let sbbf_int8 = setup_sbbf(int8_array, int8_field); let arrow_sbbf_int8 = ArrowSbbf::new(&sbbf_int8, &DataType::Int8); // Setup for Int32 benchmarks let test_val_i32 = 42i32; - let int32_array = Arc::new(Int32Array::from(vec![test_val_i32; 1000])) as ArrayRef; + let int32_array = Arc::new(Int32Array::from(vec![test_val_i32; 1])) as ArrayRef; let int32_field = Field::new("col", DataType::Int32, false); let sbbf_int32 = setup_sbbf(int32_array, int32_field); let arrow_sbbf_int32 = ArrowSbbf::new(&sbbf_int32, &DataType::Int32); @@ -111,7 +111,7 @@ fn bench_decimal_types(c: &mut Criterion) { let test_val_dec_small = 123456i128; let test_bytes_dec_small = test_val_dec_small.to_le_bytes(); let dec_small_array = Arc::new( - Decimal128Array::from(vec![test_val_dec_small; 1000]) + Decimal128Array::from(vec![test_val_dec_small; 1]) .with_precision_and_scale(5, 2) .unwrap(), ) as ArrayRef; @@ -123,7 +123,7 @@ fn bench_decimal_types(c: &mut Criterion) { let test_val_dec_medium = 123456789012345i128; let test_bytes_dec_medium = test_val_dec_medium.to_le_bytes(); let dec_medium_array = Arc::new( - Decimal128Array::from(vec![test_val_dec_medium; 1000]) + Decimal128Array::from(vec![test_val_dec_medium; 1]) .with_precision_and_scale(15, 2) .unwrap(), ) as ArrayRef; @@ -135,7 +135,7 @@ fn bench_decimal_types(c: &mut Criterion) { let test_val_dec_large = 123456789012345678901234567890i128; let test_bytes_dec_large = test_val_dec_large.to_le_bytes(); let dec_large_array = Arc::new( - Decimal128Array::from(vec![test_val_dec_large; 1000]) + Decimal128Array::from(vec![test_val_dec_large; 1]) .with_precision_and_scale(30, 2) .unwrap(), ) as ArrayRef;