Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
125 commits
Select commit Hold shift + click to select a range
e3a0b50
custom PageLocation decoder for speed
etseidl Aug 20, 2025
71d3859
fix recently added test
etseidl Aug 20, 2025
ff42e5a
clippy
etseidl Aug 20, 2025
1f2c216
experimental new form for column index
etseidl Aug 20, 2025
37f3b20
fix for test added in main
etseidl Aug 21, 2025
3d4e28e
refactor new column index
etseidl Aug 21, 2025
2b85b89
checkpoint...everything but stats converter
etseidl Aug 21, 2025
5ee1b8f
fix bug found in testing
etseidl Aug 21, 2025
624b88b
Merge branch 'new_col_idx' into new_col_idx_full
etseidl Aug 21, 2025
d99a06a
stats converter works
etseidl Aug 22, 2025
79a6917
get rid of import
etseidl Aug 22, 2025
878d460
get parquet-index working
etseidl Aug 22, 2025
009632a
doc fixes
etseidl Aug 22, 2025
998ac6c
Merge branch 'offset_idx_speedup' into new_col_idx_full
etseidl Aug 22, 2025
a822dfd
move column index to its own module
etseidl Aug 22, 2025
20df075
add ColumnIndexIterators trait, simplify stats converter a little
etseidl Aug 22, 2025
7755b7b
restore comment
etseidl Aug 22, 2025
66ed8bc
Merge branch 'new_col_idx' into new_col_idx_full
etseidl Aug 22, 2025
f6c5738
further rework...allow for fallback to slow decoder
etseidl Aug 24, 2025
3733b86
Merge branch 'offset_idx_speedup' into new_col_idx_full
etseidl Aug 24, 2025
09d71e1
refactor a bit
etseidl Aug 24, 2025
1ddaa35
simplify reading of int array
etseidl Aug 24, 2025
006d59d
Merge branch 'offset_idx_speedup' into new_col_idx_full
etseidl Aug 24, 2025
c271085
get write working for enum and some unions
etseidl Aug 25, 2025
34cdaf2
make test_roundtrip visible
etseidl Aug 25, 2025
c9be570
add test for converted_type, start on logical_type
etseidl Aug 25, 2025
a9cd09d
checkpoint struct field writing
etseidl Aug 25, 2025
ae65167
get some struct examples and lists working
etseidl Aug 25, 2025
272a013
get rid of copied allow
etseidl Aug 25, 2025
632e171
get writer macros for structs working
etseidl Aug 26, 2025
9f01b60
fix bug in struct macro
etseidl Aug 26, 2025
2511f8f
make Repetition public
etseidl Aug 26, 2025
61e9e07
get union working for writes
etseidl Aug 26, 2025
e39f119
add some tests
etseidl Aug 26, 2025
def3d07
redo OrderedF64 initialization
etseidl Aug 26, 2025
386f222
unused import
etseidl Aug 26, 2025
7ae2304
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Aug 26, 2025
6beb79d
get decryption working
etseidl Aug 26, 2025
1eaa17b
refactor and clippy fixes
etseidl Aug 26, 2025
713e38a
add page header defs
etseidl Aug 26, 2025
79e8f85
totally rework the input side
etseidl Aug 27, 2025
b31c9e6
rework struct field reading
etseidl Aug 27, 2025
8c4e49d
fix skipping bool fields
etseidl Aug 27, 2025
e0e1852
remove cruft
etseidl Aug 27, 2025
1ebfdf2
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Aug 27, 2025
366326a
Merge branch 'write_thrift' into read_and_crypto
etseidl Aug 27, 2025
7b8777a
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Aug 27, 2025
d8081a9
fix clippy issues
etseidl Aug 28, 2025
5d6c8b1
allow unused page header structs
etseidl Aug 28, 2025
709e813
remove Write from WriteThrift
etseidl Aug 29, 2025
def1d68
Merge branch 'write_thrift' into read_and_crypto
etseidl Aug 29, 2025
0579456
finish merge
etseidl Aug 29, 2025
c1587c4
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Aug 29, 2025
04b74f5
stats
etseidl Aug 29, 2025
2250e18
get new page headers working for read and write
etseidl Aug 29, 2025
6af8631
rename page header structs
etseidl Aug 29, 2025
3775222
add some fixmes
etseidl Aug 29, 2025
85f44a5
formatting
etseidl Aug 29, 2025
f0e538f
test results differ depending on features
etseidl Aug 29, 2025
763ecd7
error rather than panic on missing required fields
etseidl Aug 29, 2025
734ee9b
add option to read page stats
etseidl Aug 29, 2025
5569757
add comments
etseidl Aug 29, 2025
23636c9
clippy
etseidl Aug 29, 2025
179bb21
switch page header bench to new code
etseidl Aug 29, 2025
4f7bd62
add comment
etseidl Aug 29, 2025
51cf33a
benchmark changes
etseidl Aug 29, 2025
b4ca56e
update benchmarks to match thrift-remodel feature branch
etseidl Aug 29, 2025
c702a44
add encoding_stats to wide data set
etseidl Aug 30, 2025
0893ec7
clippy
etseidl Aug 30, 2025
689297c
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Aug 30, 2025
7d47857
Merge branch 'write_thrift' into read_and_crypto
etseidl Aug 30, 2025
b543838
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Aug 30, 2025
99ee049
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Aug 30, 2025
f158d72
Merge branch 'update_metadata_bench' into read_page_header
etseidl Aug 30, 2025
56f5c5d
remove dup from merge
etseidl Sep 4, 2025
b37029e
checkpoint offset index
etseidl Sep 5, 2025
086d04c
write path for column index
etseidl Sep 5, 2025
ecd24de
copy over tests from index
etseidl Sep 5, 2025
1e510bc
remove index module
etseidl Sep 5, 2025
138b0d5
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Sep 5, 2025
5b6c177
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 5, 2025
88959be
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 5, 2025
9fe5a9a
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 5, 2025
52d73e9
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 5, 2025
29091cd
refactor column index building
etseidl Sep 5, 2025
c729d22
Merge remote-tracking branch 'origin/gh5854_thrift_remodel' into writ…
etseidl Sep 8, 2025
96419c4
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 8, 2025
6ec102f
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 8, 2025
976b36d
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 8, 2025
ceac418
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 8, 2025
f81a732
get a start on some documentation and add some TODOs
etseidl Sep 10, 2025
be58ea6
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 10, 2025
02e5e16
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
61aa392
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
428e84c
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
7268dd3
fix docs
etseidl Sep 10, 2025
8305915
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 10, 2025
4221646
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
4342cb5
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
ddbeb55
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
f0beb0b
Merge branch 'gh5854_thrift_remodel' into read_and_crypto
etseidl Sep 10, 2025
b303e52
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
2955b85
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
3d33707
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
cfa6740
backport fix for tests without encryption
etseidl Sep 10, 2025
6c82028
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
b16e118
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
1afd866
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
82f31a4
add documentation
etseidl Sep 11, 2025
608c0f3
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 11, 2025
237ca3d
add docs for ThriftReadInputProtocol
etseidl Sep 11, 2025
bdb9aa9
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 11, 2025
4da5d9e
Merge branch 'gh5854_thrift_remodel' into rework_thrift_reader
etseidl Sep 12, 2025
afb4adf
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 12, 2025
9909d0c
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 12, 2025
ebae0af
Merge branch 'gh5854_thrift_remodel' into read_page_header
etseidl Sep 17, 2025
7560e70
fix typo
etseidl Sep 17, 2025
e94a2de
fix typo
etseidl Sep 17, 2025
1ff8b88
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 17, 2025
56a75d6
clean up some imports
etseidl Sep 17, 2025
7b549f9
update docs for PageStatistics
etseidl Sep 23, 2025
a6ca284
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 23, 2025
943c674
Merge remote-tracking branch 'origin/gh5854_thrift_remodel' into writ…
etseidl Sep 23, 2025
b9e97c5
Merge branch 'gh5854_thrift_remodel' into write_page_indexes
etseidl Sep 23, 2025
0701d60
backport some doc fixes
etseidl Sep 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ impl<T> ArrowReaderBuilder<T> {
/// Skip 1100 (skip the remaining 900 rows in row group 2 and the first 200 rows in row group 3)
/// ```
///
/// [`Index`]: crate::file::page_index::index::Index
/// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub fn with_row_selection(self, selection: RowSelection) -> Self {
Self {
selection: Some(selection),
Expand Down
2 changes: 1 addition & 1 deletion parquet/src/arrow/arrow_reader/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ impl RowSelector {
/// * It contains no [`RowSelector`] of 0 rows
/// * Consecutive [`RowSelector`]s alternate skipping or selecting rows
///
/// [`PageIndex`]: crate::file::page_index::index::PageIndex
/// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
#[derive(Debug, Clone, Default, Eq, PartialEq)]
pub struct RowSelection {
selectors: Vec<RowSelector>,
Expand Down
76 changes: 35 additions & 41 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use bytes::Bytes;
use half::f16;

use crate::bloom_filter::Sbbf;
use crate::file::page_index::index::Index;
use crate::file::page_index::column_index::ColumnIndexMetaData;
use crate::file::page_index::offset_index::OffsetIndexMetaData;
use std::collections::{BTreeSet, VecDeque};
use std::str;
Expand Down Expand Up @@ -192,7 +192,7 @@ pub struct ColumnCloseResult {
/// Optional bloom filter for this column
pub bloom_filter: Option<Sbbf>,
/// Optional column index, for filtering
pub column_index: Option<Index>,
pub column_index: Option<ColumnIndexMetaData>,
/// Optional offset index, identifying page locations
pub offset_index: Option<OffsetIndexMetaData>,
}
Expand Down Expand Up @@ -2959,28 +2959,22 @@ mod tests {
assert!(r.column_index.is_some());
let col_idx = r.column_index.unwrap();
let col_idx = match col_idx {
Index::INT32(col_idx) => col_idx,
ColumnIndexMetaData::INT32(col_idx) => col_idx,
_ => panic!("wrong stats type"),
};
// null_pages should be true for page 0
assert!(col_idx.indexes[0].is_null_page());
assert!(col_idx.is_null_page(0));
// min and max should be empty byte arrays
assert!(col_idx.indexes[0].min().is_none());
assert!(col_idx.indexes[0].max().is_none());
assert!(col_idx.min_value(0).is_none());
assert!(col_idx.max_value(0).is_none());
// null_counts should be defined and be 4 for page 0
assert!(col_idx.indexes[0].null_count().is_some());
assert_eq!(col_idx.indexes[0].null_count().unwrap(), 4);
assert!(col_idx.null_count(0).is_some());
assert_eq!(col_idx.null_count(0), Some(4));
// there is no repetition so rep histogram should be absent
assert!(col_idx.indexes[0].repetition_level_histogram().is_none());
assert!(col_idx.repetition_level_histogram(0).is_none());
// definition_level_histogram should be present and should be 0:4, 1:0
assert!(col_idx.indexes[0].definition_level_histogram().is_some());
assert_eq!(
col_idx.indexes[0]
.definition_level_histogram()
.unwrap()
.values(),
&[4, 0]
);
assert!(col_idx.definition_level_histogram(0).is_some());
assert_eq!(col_idx.definition_level_histogram(0).unwrap(), &[4, 0]);
}

#[test]
Expand All @@ -3004,15 +2998,15 @@ mod tests {

// column index
let column_index = match column_index {
Index::INT32(column_index) => column_index,
ColumnIndexMetaData::INT32(column_index) => column_index,
_ => panic!("wrong stats type"),
};
assert_eq!(2, column_index.indexes.len());
assert_eq!(2, column_index.num_pages());
assert_eq!(2, offset_index.page_locations.len());
assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order);
// Every page written by this test must be a non-null page with a null count of zero.
for idx in 0..2 {
    assert!(!column_index.indexes[idx].is_null_page());
    assert_eq!(0, *column_index.indexes[idx].null_count.as_ref().unwrap());
    assert!(!column_index.is_null_page(idx));
    // Index by `idx` so each page's null count is checked, not page 0 twice.
    assert_eq!(0, column_index.null_count(idx).unwrap());
}

if let Some(stats) = r.metadata.statistics() {
Expand All @@ -3022,8 +3016,8 @@ mod tests {
// first page is [1,2,3,4]
// second page is [-5,2,4,8]
// note that we don't increment here, as this is a non BinaryArray type.
assert_eq!(stats.min_opt(), column_index.indexes[1].min());
assert_eq!(stats.max_opt(), column_index.indexes[1].max());
assert_eq!(stats.min_opt(), column_index.min_value(1));
assert_eq!(stats.max_opt(), column_index.max_value(1));
} else {
panic!("expecting Statistics::Int32");
}
Expand Down Expand Up @@ -3064,25 +3058,25 @@ mod tests {
let offset_index = r.offset_index.unwrap();

let column_index = match column_index {
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
_ => panic!("wrong stats type"),
};

assert_eq!(3, r.rows_written);

// column index
assert_eq!(1, column_index.indexes.len());
assert_eq!(1, column_index.num_pages());
assert_eq!(1, offset_index.page_locations.len());
assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order);
assert!(!column_index.indexes[0].is_null_page());
assert_eq!(Some(0), column_index.indexes[0].null_count());
assert!(!column_index.is_null_page(0));
assert_eq!(Some(0), column_index.null_count(0));

if let Some(stats) = r.metadata.statistics() {
assert_eq!(stats.null_count_opt(), Some(0));
assert_eq!(stats.distinct_count_opt(), None);
if let Statistics::FixedLenByteArray(stats) = stats {
let column_index_min_value = column_index.indexes[0].min_bytes().unwrap();
let column_index_max_value = column_index.indexes[0].max_bytes().unwrap();
let column_index_min_value = column_index.min_value(0).unwrap();
let column_index_max_value = column_index.max_value(0).unwrap();

// Column index stats are truncated, while the column chunk's aren't.
assert_ne!(stats.min_bytes_opt().unwrap(), column_index_min_value);
Expand Down Expand Up @@ -3135,25 +3129,25 @@ mod tests {
let offset_index = r.offset_index.unwrap();

let column_index = match column_index {
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
_ => panic!("wrong stats type"),
};

assert_eq!(1, r.rows_written);

// column index
assert_eq!(1, column_index.indexes.len());
assert_eq!(1, column_index.num_pages());
assert_eq!(1, offset_index.page_locations.len());
assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order);
assert!(!column_index.indexes[0].is_null_page());
assert_eq!(Some(0), column_index.indexes[0].null_count());
assert!(!column_index.is_null_page(0));
assert_eq!(Some(0), column_index.null_count(0));

if let Some(stats) = r.metadata.statistics() {
assert_eq!(stats.null_count_opt(), Some(0));
assert_eq!(stats.distinct_count_opt(), None);
if let Statistics::FixedLenByteArray(_stats) = stats {
let column_index_min_value = column_index.indexes[0].min_bytes().unwrap();
let column_index_max_value = column_index.indexes[0].max_bytes().unwrap();
let column_index_min_value = column_index.min_value(0).unwrap();
let column_index_max_value = column_index.max_value(0).unwrap();

assert_eq!(column_index_min_value.len(), 1);
assert_eq!(column_index_max_value.len(), 1);
Expand Down Expand Up @@ -3190,11 +3184,11 @@ mod tests {
// ensure bytes weren't truncated for column index
let column_index = r.column_index.unwrap();
let column_index = match column_index {
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
_ => panic!("wrong stats type"),
};
let column_index_min_bytes = column_index.indexes[0].min_bytes().unwrap();
let column_index_max_bytes = column_index.indexes[0].min_bytes().unwrap();
let column_index_min_bytes = column_index.min_value(0).unwrap();
let column_index_max_bytes = column_index.max_value(0).unwrap();
assert_eq!(expected_value, column_index_min_bytes);
assert_eq!(expected_value, column_index_max_bytes);

Expand Down Expand Up @@ -3233,11 +3227,11 @@ mod tests {
// ensure bytes weren't truncated for column index
let column_index = r.column_index.unwrap();
let column_index = match column_index {
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
_ => panic!("wrong stats type"),
};
let column_index_min_bytes = column_index.indexes[0].min_bytes().unwrap();
let column_index_max_bytes = column_index.indexes[0].min_bytes().unwrap();
let column_index_min_bytes = column_index.min_value(0).unwrap();
let column_index_max_bytes = column_index.max_value(0).unwrap();
assert_eq!(expected_value, column_index_min_bytes);
assert_eq!(expected_value, column_index_max_bytes);

Expand Down
29 changes: 0 additions & 29 deletions parquet/src/file/metadata/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ use crate::file::page_encoding_stats::PageEncodingStats;
use crate::file::page_index::column_index::{
ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
};
use crate::file::page_index::index::{Index, NativeIndex, PageIndex};
use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
use crate::file::statistics::{Statistics, ValueStatistics};
use std::sync::Arc;
Expand Down Expand Up @@ -199,34 +198,6 @@ impl HeapSize for ByteArrayColumnIndex {
}
}

/// Heap accounting for the column `Index` enum.
impl HeapSize for Index {
    /// Returns the heap size reported by the value wrapped in the active variant;
    /// `NONE` has no payload and therefore reports zero.
    fn heap_size(&self) -> usize {
        match self {
            Self::NONE => 0,
            Self::BOOLEAN(idx) => idx.heap_size(),
            Self::INT32(idx) => idx.heap_size(),
            Self::INT64(idx) => idx.heap_size(),
            Self::INT96(idx) => idx.heap_size(),
            Self::FLOAT(idx) => idx.heap_size(),
            Self::DOUBLE(idx) => idx.heap_size(),
            Self::BYTE_ARRAY(idx) => idx.heap_size(),
            Self::FIXED_LEN_BYTE_ARRAY(idx) => idx.heap_size(),
        }
    }
}

/// Heap accounting for a typed `NativeIndex`.
impl<T: ParquetValueType> HeapSize for NativeIndex<T> {
    /// Returns the combined heap size of the `indexes` and `boundary_order` fields.
    fn heap_size(&self) -> usize {
        let pages = self.indexes.heap_size();
        let ordering = self.boundary_order.heap_size();
        pages + ordering
    }
}

/// Heap accounting for a single page's `PageIndex` entry.
impl<T: ParquetValueType> HeapSize for PageIndex<T> {
    /// Returns the combined heap size of the `min`, `max` and `null_count` fields.
    fn heap_size(&self) -> usize {
        // NOTE(review): only these three fields are counted; if `PageIndex` also stores
        // repetition/definition level histograms, their heap usage is not included here —
        // confirm whether that is intentional.
        let min_bytes = self.min.heap_size();
        let max_bytes = self.max.heap_size();
        let null_count_bytes = self.null_count.heap_size();
        min_bytes + max_bytes + null_count_bytes
    }
}

impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
fn heap_size(&self) -> usize {
self.min_opt().map(T::heap_size).unwrap_or(0)
Expand Down
Loading
Loading