Skip to content

Commit 1404608

Browse files
authored
[thrift-remodel] Add thrift write support (#8237)
# Which issue does this PR close? **Note: this targets a feature branch, not main** - Part of #5854. # Rationale for this change Begins adding custom thrift write support. # What changes are included in this PR? Adds traits to aid in writing of thrift and modifies thrift macros to generate writing code. # Are these changes tested? Yes, adds some roundtrip tests to validate encoded data can be decoded to the same state. # Are there any user-facing changes? No
1 parent 9596775 commit 1404608

File tree

10 files changed

+1085
-72
lines changed

10 files changed

+1085
-72
lines changed

parquet/src/basic.rs

Lines changed: 316 additions & 59 deletions
Large diffs are not rendered by default.

parquet/src/file/column_crypto_metadata.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@
1717

1818
//! Column chunk encryption metadata
1919
20+
use std::io::Write;
21+
2022
use crate::errors::{ParquetError, Result};
2123
use crate::format::{
2224
ColumnCryptoMetaData as TColumnCryptoMetaData,
2325
EncryptionWithColumnKey as TEncryptionWithColumnKey,
2426
EncryptionWithFooterKey as TEncryptionWithFooterKey,
2527
};
26-
use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
28+
use crate::parquet_thrift::{
29+
ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift,
30+
WriteThriftField,
31+
};
2732
use crate::{thrift_struct, thrift_union};
2833

2934
// define this and ColumnCryptoMetadata here so they're only defined when
@@ -84,6 +89,7 @@ pub fn to_thrift(column_crypto_metadata: &ColumnCryptoMetaData) -> TColumnCrypto
8489
#[cfg(test)]
8590
mod tests {
8691
use super::*;
92+
use crate::parquet_thrift::tests::test_roundtrip;
8793

8894
#[test]
8995
fn test_encryption_with_footer_key_from_thrift() {
@@ -101,4 +107,24 @@ mod tests {
101107

102108
assert_eq!(try_from_thrift(&to_thrift(&metadata)).unwrap(), metadata);
103109
}
110+
111+
#[test]
112+
fn test_column_crypto_roundtrip() {
113+
test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY);
114+
115+
let path_in_schema = vec!["foo".to_owned(), "bar".to_owned(), "really".to_owned()];
116+
let key_metadata = vec![1u8; 32];
117+
test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(
118+
EncryptionWithColumnKey {
119+
path_in_schema: path_in_schema.clone(),
120+
key_metadata: None,
121+
},
122+
));
123+
test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(
124+
EncryptionWithColumnKey {
125+
path_in_schema,
126+
key_metadata: Some(key_metadata),
127+
},
128+
));
129+
}
104130
}

parquet/src/file/metadata/mod.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,10 @@ use crate::{
125125
};
126126
use crate::{
127127
basic::{ColumnOrder, Compression, Encoding, Type},
128-
parquet_thrift::{FieldType, ThriftCompactInputProtocol},
128+
parquet_thrift::{
129+
ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
130+
WriteThrift, WriteThriftField,
131+
},
129132
};
130133
use crate::{
131134
data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
@@ -135,6 +138,7 @@ use crate::{
135138
thrift_struct,
136139
};
137140
pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader};
141+
use std::io::Write;
138142
use std::ops::Range;
139143
use std::sync::Arc;
140144
pub use writer::ParquetMetaDataWriter;

parquet/src/file/metadata/thrift_gen.rs

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
// a collection of generated structs used to parse thrift metadata
1919

20+
use std::io::Write;
2021
use std::sync::Arc;
2122

2223
#[cfg(feature = "encryption")]
@@ -33,7 +34,10 @@ use crate::{
3334
page_encoding_stats::PageEncodingStats,
3435
statistics::ValueStatistics,
3536
},
36-
parquet_thrift::{FieldType, ThriftCompactInputProtocol},
37+
parquet_thrift::{
38+
ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
39+
WriteThrift, WriteThriftField,
40+
},
3741
schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor},
3842
thrift_struct,
3943
util::bit_util::FromBytes,
@@ -507,3 +511,45 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData {
507511
Ok(ParquetMetaData::new(fmd, row_groups))
508512
}
509513
}
514+
515+
#[cfg(test)]
516+
mod tests {
517+
use crate::file::metadata::thrift_gen::BoundingBox;
518+
use crate::parquet_thrift::tests::test_roundtrip;
519+
520+
#[test]
521+
fn test_bounding_box_roundtrip() {
522+
test_roundtrip(BoundingBox {
523+
xmin: 0.1.into(),
524+
xmax: 10.3.into(),
525+
ymin: 0.001.into(),
526+
ymax: 128.5.into(),
527+
zmin: None,
528+
zmax: None,
529+
mmin: None,
530+
mmax: None,
531+
});
532+
533+
test_roundtrip(BoundingBox {
534+
xmin: 0.1.into(),
535+
xmax: 10.3.into(),
536+
ymin: 0.001.into(),
537+
ymax: 128.5.into(),
538+
zmin: Some(11.0.into()),
539+
zmax: Some(1300.0.into()),
540+
mmin: None,
541+
mmax: None,
542+
});
543+
544+
test_roundtrip(BoundingBox {
545+
xmin: 0.1.into(),
546+
xmax: 10.3.into(),
547+
ymin: 0.001.into(),
548+
ymax: 128.5.into(),
549+
zmin: Some(11.0.into()),
550+
zmax: Some(1300.0.into()),
551+
mmin: Some(3.7.into()),
552+
mmax: Some(42.0.into()),
553+
});
554+
}
555+
}

parquet/src/file/page_encoding_stats.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,14 @@
1717

1818
//! Per-page encoding information.
1919
20+
use std::io::Write;
21+
2022
use crate::basic::{Encoding, PageType};
2123
use crate::errors::{ParquetError, Result};
22-
use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
24+
use crate::parquet_thrift::{
25+
ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift,
26+
WriteThriftField,
27+
};
2328
use crate::thrift_struct;
2429

2530
// TODO: This should probably all be moved to thrift_gen

parquet/src/file/page_index/index_reader.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@ use crate::file::page_index::column_index::{
2626
};
2727
use crate::file::page_index::offset_index::OffsetIndexMetaData;
2828
use crate::file::reader::ChunkReader;
29-
use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
29+
use crate::parquet_thrift::{
30+
ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift,
31+
WriteThriftField,
32+
};
3033
use crate::thrift_struct;
34+
use std::io::Write;
3135
use std::ops::Range;
3236

3337
/// Computes the covering range of two optional ranges

parquet/src/file/page_index/offset_index.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919
//!
2020
//! [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
2121
22-
use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
22+
use std::io::Write;
23+
24+
use crate::parquet_thrift::{
25+
ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift,
26+
WriteThriftField,
27+
};
2328
use crate::{
2429
errors::{ParquetError, Result},
2530
thrift_struct,
@@ -193,3 +198,36 @@ fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<P
193198
first_row_index,
194199
})
195200
}
201+
202+
#[cfg(test)]
203+
mod tests {
204+
use super::*;
205+
use crate::parquet_thrift::tests::test_roundtrip;
206+
207+
#[test]
208+
fn test_offset_idx_roundtrip() {
209+
let page_locations = [
210+
PageLocation {
211+
offset: 0,
212+
compressed_page_size: 10,
213+
first_row_index: 0,
214+
},
215+
PageLocation {
216+
offset: 10,
217+
compressed_page_size: 20,
218+
first_row_index: 100,
219+
},
220+
]
221+
.to_vec();
222+
let unenc = [0i64, 100i64].to_vec();
223+
224+
test_roundtrip(OffsetIndexMetaData {
225+
page_locations: page_locations.clone(),
226+
unencoded_byte_array_data_bytes: Some(unenc),
227+
});
228+
test_roundtrip(OffsetIndexMetaData {
229+
page_locations,
230+
unencoded_byte_array_data_bytes: None,
231+
});
232+
}
233+
}

0 commit comments

Comments
 (0)