1515// specific language governing permissions and limitations
1616// under the License.
1717
18+ use rand:: Rng ;
19+ use thrift:: protocol:: TCompactOutputProtocol ;
20+
21+ use arrow:: util:: test_util:: seedable_rng;
1822use bytes:: Bytes ;
1923use criterion:: * ;
2024use parquet:: file:: reader:: SerializedFileReader ;
2125use parquet:: file:: serialized_reader:: ReadOptionsBuilder ;
26+ use parquet:: format:: {
27+ ColumnChunk , ColumnMetaData , CompressionCodec , Encoding , FieldRepetitionType , FileMetaData ,
28+ RowGroup , SchemaElement , Type ,
29+ } ;
30+ use parquet:: thrift:: TSerializable ;
31+
32+ const NUM_COLUMNS : usize = 10_000 ;
33+ const NUM_ROW_GROUPS : usize = 10 ;
34+
35+ fn encoded_meta ( ) -> Vec < u8 > {
36+ let mut rng = seedable_rng ( ) ;
37+
38+ let mut schema = Vec :: with_capacity ( NUM_COLUMNS + 1 ) ;
39+ schema. push ( SchemaElement {
40+ type_ : None ,
41+ type_length : None ,
42+ repetition_type : None ,
43+ name : Default :: default ( ) ,
44+ num_children : Some ( NUM_COLUMNS as _ ) ,
45+ converted_type : None ,
46+ scale : None ,
47+ precision : None ,
48+ field_id : None ,
49+ logical_type : None ,
50+ } ) ;
51+ for i in 0 ..NUM_COLUMNS {
52+ schema. push ( SchemaElement {
53+ type_ : Some ( Type :: FLOAT ) ,
54+ type_length : None ,
55+ repetition_type : Some ( FieldRepetitionType :: REQUIRED ) ,
56+ name : i. to_string ( ) ,
57+ num_children : None ,
58+ converted_type : None ,
59+ scale : None ,
60+ precision : None ,
61+ field_id : None ,
62+ logical_type : None ,
63+ } )
64+ }
65+
66+ let stats = parquet:: format:: Statistics {
67+ min : None ,
68+ max : None ,
69+ null_count : Some ( 0 ) ,
70+ distinct_count : None ,
71+ max_value : Some ( vec ! [ rng. random( ) ; 8 ] ) ,
72+ min_value : Some ( vec ! [ rng. random( ) ; 8 ] ) ,
73+ is_max_value_exact : Some ( true ) ,
74+ is_min_value_exact : Some ( true ) ,
75+ } ;
76+
77+ let row_groups = ( 0 ..NUM_ROW_GROUPS )
78+ . map ( |i| {
79+ let columns = ( 0 ..NUM_COLUMNS )
80+ . map ( |_| ColumnChunk {
81+ file_path : None ,
82+ file_offset : 0 ,
83+ meta_data : Some ( ColumnMetaData {
84+ type_ : Type :: FLOAT ,
85+ encodings : vec ! [ Encoding :: PLAIN , Encoding :: RLE_DICTIONARY ] ,
86+ path_in_schema : vec ! [ ] ,
87+ codec : CompressionCodec :: UNCOMPRESSED ,
88+ num_values : rng. random ( ) ,
89+ total_uncompressed_size : rng. random ( ) ,
90+ total_compressed_size : rng. random ( ) ,
91+ key_value_metadata : None ,
92+ data_page_offset : rng. random ( ) ,
93+ index_page_offset : Some ( rng. random ( ) ) ,
94+ dictionary_page_offset : Some ( rng. random ( ) ) ,
95+ statistics : Some ( stats. clone ( ) ) ,
96+ encoding_stats : None ,
97+ bloom_filter_offset : None ,
98+ bloom_filter_length : None ,
99+ size_statistics : None ,
100+ geospatial_statistics : None ,
101+ } ) ,
102+ offset_index_offset : Some ( rng. random ( ) ) ,
103+ offset_index_length : Some ( rng. random ( ) ) ,
104+ column_index_offset : Some ( rng. random ( ) ) ,
105+ column_index_length : Some ( rng. random ( ) ) ,
106+ crypto_metadata : None ,
107+ encrypted_column_metadata : None ,
108+ } )
109+ . collect ( ) ;
110+
111+ RowGroup {
112+ columns,
113+ total_byte_size : rng. random ( ) ,
114+ num_rows : rng. random ( ) ,
115+ sorting_columns : None ,
116+ file_offset : None ,
117+ total_compressed_size : Some ( rng. random ( ) ) ,
118+ ordinal : Some ( i as _ ) ,
119+ }
120+ } )
121+ . collect ( ) ;
122+
123+ let file = FileMetaData {
124+ schema,
125+ row_groups,
126+ version : 1 ,
127+ num_rows : rng. random ( ) ,
128+ key_value_metadata : None ,
129+ created_by : Some ( "parquet-rs" . into ( ) ) ,
130+ column_orders : None ,
131+ encryption_algorithm : None ,
132+ footer_signing_key_metadata : None ,
133+ } ;
134+
135+ let mut buf = Vec :: with_capacity ( 1024 ) ;
136+ {
137+ let mut out = TCompactOutputProtocol :: new ( & mut buf) ;
138+ file. write_to_out_protocol ( & mut out) . unwrap ( ) ;
139+ }
140+ buf
141+ }
142+
143+ fn get_footer_bytes ( data : Bytes ) -> Bytes {
144+ let footer_bytes = data. slice ( data. len ( ) - 8 ..) ;
145+ let footer_len = footer_bytes[ 0 ] as u32
146+ | ( footer_bytes[ 1 ] as u32 ) << 8
147+ | ( footer_bytes[ 2 ] as u32 ) << 16
148+ | ( footer_bytes[ 3 ] as u32 ) << 24 ;
149+ let meta_start = data. len ( ) - footer_len as usize - 8 ;
150+ let meta_end = data. len ( ) - 8 ;
151+ data. slice ( meta_start..meta_end)
152+ }
22153
23154fn criterion_benchmark ( c : & mut Criterion ) {
24155 // Read file into memory to isolate filesystem performance
@@ -36,6 +167,20 @@ fn criterion_benchmark(c: &mut Criterion) {
36167 SerializedFileReader :: new_with_options ( data. clone ( ) , options) . unwrap ( )
37168 } )
38169 } ) ;
170+
171+ let meta_data = get_footer_bytes ( data) ;
172+ c. bench_function ( "decode file metadata" , |b| {
173+ b. iter ( || {
174+ parquet:: thrift:: bench_file_metadata ( & meta_data) ;
175+ } )
176+ } ) ;
177+
178+ let buf = black_box ( encoded_meta ( ) ) . into ( ) ;
179+ c. bench_function ( "decode file metadata (wide)" , |b| {
180+ b. iter ( || {
181+ parquet:: thrift:: bench_file_metadata ( & buf) ;
182+ } )
183+ } ) ;
39184}
40185
41186criterion_group ! ( benches, criterion_benchmark) ;
0 commit comments