@@ -214,6 +214,14 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
214
214
for (auto encoding : column_metadata_->encodings ) {
215
215
encodings_.push_back (FromThrift (encoding));
216
216
}
217
+ auto fromthrift = [](format::PageEncodingStats page_encoding_stats) {
218
+ return parquet::PageEncodingStats (FromThrift (page_encoding_stats.page_type ),
219
+ FromThrift (page_encoding_stats.encoding ),
220
+ page_encoding_stats.count );
221
+ };
222
+ for (auto encoding_stats : column_metadata_->encoding_stats ) {
223
+ encoding_stats_.push_back (fromthrift (encoding_stats));
224
+ }
217
225
possible_stats_ = nullptr ;
218
226
}
219
227
// column chunk
@@ -257,6 +265,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
257
265
258
266
// Read-only view of the encodings_ member; no copy is made.
const std::vector<Encoding::type>& encodings() const {
  return encodings_;
}
259
267
268
+ const std::vector<PageEncodingStats>& encoding_stats () const { return encoding_stats_; }
269
+
260
270
inline bool has_dictionary_page () const {
261
271
return column_metadata_->__isset .dictionary_page_offset ;
262
272
}
@@ -293,6 +303,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
293
303
private:
294
304
mutable std::shared_ptr<Statistics> possible_stats_;
295
305
std::vector<Encoding::type> encodings_;
306
+ std::vector<PageEncodingStats> encoding_stats_;
296
307
const format::ColumnChunk* column_;
297
308
const format::ColumnMetaData* column_metadata_;
298
309
format::ColumnMetaData decrypted_metadata_;
@@ -367,6 +378,10 @@ const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
367
378
return impl_->encodings ();
368
379
}
369
380
381
+ const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats () const {
382
+ return impl_->encoding_stats ();
383
+ }
384
+
370
385
int64_t ColumnChunkMetaData::total_uncompressed_size () const {
371
386
return impl_->total_uncompressed_size ();
372
387
}
@@ -966,7 +981,9 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
966
981
void Finish (int64_t num_values, int64_t dictionary_page_offset,
967
982
int64_t index_page_offset, int64_t data_page_offset,
968
983
int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
969
- bool dictionary_fallback, const std::shared_ptr<Encryptor>& encryptor) {
984
+ bool dictionary_fallback, int32_t num_dict_pages,
985
+ std::map<Encoding::type, int32_t >& num_data_pages,
986
+ const std::shared_ptr<Encryptor>& encryptor) {
970
987
if (dictionary_page_offset > 0 ) {
971
988
column_chunk_->meta_data .__set_dictionary_page_offset (dictionary_page_offset);
972
989
column_chunk_->__set_file_offset (dictionary_page_offset + compressed_size);
@@ -983,23 +1000,62 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
983
1000
column_chunk_->meta_data .__set_total_compressed_size (compressed_size);
984
1001
985
1002
std::vector<format::Encoding::type> thrift_encodings;
1003
+ std::vector<format::PageEncodingStats> thrift_encoding_stats;
986
1004
if (has_dictionary) {
987
1005
thrift_encodings.push_back (ToThrift (properties_->dictionary_index_encoding ()));
1006
+ format::PageEncodingStats dict_page_stats;
1007
+ dict_page_stats.__set_page_type (format::PageType::DICTIONARY_PAGE);
1008
+ dict_page_stats.__set_encoding (ToThrift (properties_->dictionary_index_encoding ()));
1009
+ dict_page_stats.__set_count (num_dict_pages);
1010
+ thrift_encoding_stats.push_back (dict_page_stats);
1011
+ // Add DataPage stats
1012
+ format::PageEncodingStats data_page_stats;
988
1013
if (properties_->version () == ParquetVersion::PARQUET_1_0) {
989
1014
thrift_encodings.push_back (ToThrift (Encoding::PLAIN));
1015
+ data_page_stats.__set_page_type (format::PageType::DATA_PAGE);
1016
+ data_page_stats.__set_encoding (ToThrift (Encoding::PLAIN));
1017
+ data_page_stats.__set_count (num_data_pages[Encoding::PLAIN]);
990
1018
} else {
991
1019
thrift_encodings.push_back (ToThrift (properties_->dictionary_page_encoding ()));
1020
+ data_page_stats.__set_page_type (format::PageType::DATA_PAGE);
1021
+ data_page_stats.__set_encoding (ToThrift (properties_->dictionary_page_encoding ()));
1022
+ data_page_stats.__set_count (
1023
+ num_data_pages[properties_->dictionary_page_encoding ()]);
992
1024
}
1025
+ thrift_encoding_stats.push_back (data_page_stats);
993
1026
} else { // Dictionary not enabled
994
1027
thrift_encodings.push_back (ToThrift (properties_->encoding (column_->path ())));
1028
+ // Add DataPage stats
1029
+ format::PageEncodingStats data_page_stats;
1030
+ data_page_stats.__set_page_type (format::PageType::DATA_PAGE);
1031
+ if (column_->physical_type () == Type::BOOLEAN) {
1032
+ data_page_stats.__set_encoding (ToThrift (Encoding::PLAIN));
1033
+ data_page_stats.__set_count (num_data_pages[Encoding::PLAIN]);
1034
+ } else {
1035
+ data_page_stats.__set_encoding (ToThrift (properties_->encoding (column_->path ())));
1036
+ data_page_stats.__set_count (
1037
+ num_data_pages[properties_->encoding (column_->path ())]);
1038
+ }
1039
+ thrift_encoding_stats.push_back (data_page_stats);
995
1040
}
996
1041
thrift_encodings.push_back (ToThrift (Encoding::RLE));
1042
+ format::PageEncodingStats data_page_stats;
1043
+ data_page_stats.__set_page_type (format::PageType::DATA_PAGE);
1044
+ data_page_stats.__set_encoding (ToThrift (Encoding::RLE));
1045
+ data_page_stats.__set_count (num_data_pages[Encoding::RLE]);
1046
+ thrift_encoding_stats.push_back (data_page_stats);
997
1047
// Only PLAIN encoding is supported for fallback in V1
998
1048
// TODO(majetideepak): Use user specified encoding for V2
999
1049
if (dictionary_fallback) {
1000
1050
thrift_encodings.push_back (ToThrift (Encoding::PLAIN));
1051
+ format::PageEncodingStats fallback_page_stats;
1052
+ fallback_page_stats.__set_page_type (format::PageType::DATA_PAGE);
1053
+ fallback_page_stats.__set_encoding (ToThrift (Encoding::PLAIN));
1054
+ fallback_page_stats.__set_count (num_data_pages[Encoding::PLAIN]);
1055
+ thrift_encoding_stats.push_back (fallback_page_stats);
1001
1056
}
1002
1057
column_chunk_->meta_data .__set_encodings (thrift_encodings);
1058
+ column_chunk_->meta_data .__set_encoding_stats (thrift_encoding_stats);
1003
1059
1004
1060
const auto & encrypt_md =
1005
1061
properties_->column_encryption_properties (column_->path ()->ToDotString ());
@@ -1122,11 +1178,12 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values,
1122
1178
int64_t index_page_offset,
1123
1179
int64_t data_page_offset, int64_t compressed_size,
1124
1180
int64_t uncompressed_size, bool has_dictionary,
1125
- bool dictionary_fallback,
1181
+ bool dictionary_fallback, int32_t num_dict_pages,
1182
+ std::map<Encoding::type, int32_t > num_data_pages,
1126
1183
const std::shared_ptr<Encryptor>& encryptor) {
1127
1184
impl_->Finish (num_values, dictionary_page_offset, index_page_offset, data_page_offset,
1128
1185
compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
1129
- encryptor);
1186
+ num_dict_pages, num_data_pages, encryptor);
1130
1187
}
1131
1188
1132
1189
void ColumnChunkMetaDataBuilder::WriteTo (::arrow::io::OutputStream* sink) {
0 commit comments