Skip to content

Commit b4acb0b

Browse files
omega-bigstreamwesm
andcommitted
PARQUET-1780: [C++] Set ColumnMetadata.encoding_stats field
This is to solve the issue PARQUET-1780: the ColumnMetadata.encoding_stats field is empty in the parquet-cpp implementation. This leads to metadata mismatches between two Parquet files generated by C++ and Scala (parquet-mr). encoding_stats is a vector of **PageEncodingStats**. PageEncodingStats has three attributes: - page_type: (data or dict) - encoding: encoding of the page - count: number of pages of this type with this encoding. Of the above, the first two can be extracted from available information. But for count I have to add some attributes to existing classes. Modifications: For the class **SerializedPageWriter**, added the following two attributes. int32_t num_dict_pages_; std::pair<int32_t, int32_t> num_data_pages_; (first: number of un-encoded pages, second: number of encoded pages) Closes #6370 from omega-gamage/PARQUET-1780 and squashes the following commits: 086af4e <Wes McKinney> Code review comments a9c684b <Omega Gamage> Match the implementation with impala implementation eae56fa <Wes McKinney> Simplify PageEncodingStats 54ac1eb <Omega Gamage> commit 9eecaaf Author: Omega Gamage <[email protected]> Date: Tue Feb 18 14:23:08 2020 +0530 Lead-authored-by: Omega Gamage <[email protected]> Co-authored-by: Wes McKinney <[email protected]> Signed-off-by: Wes McKinney <[email protected]>
1 parent 21c4d4b commit b4acb0b

File tree

6 files changed

+116
-17
lines changed

6 files changed

+116
-17
lines changed

cpp/src/parquet/column_writer.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <algorithm>
2121
#include <cstdint>
2222
#include <cstring>
23+
#include <map>
2324
#include <memory>
2425
#include <string>
2526
#include <utility>
@@ -226,6 +227,7 @@ class SerializedPageWriter : public PageWriter {
226227

227228
total_uncompressed_size_ += uncompressed_size + header_size;
228229
total_compressed_size_ += output_data_len + header_size;
230+
++dict_encoding_stats_[page.encoding()];
229231

230232
PARQUET_ASSIGN_OR_THROW(int64_t final_pos, sink_->Tell());
231233
return final_pos - start_pos;
@@ -238,7 +240,8 @@ class SerializedPageWriter : public PageWriter {
238240
// index_page_offset = -1 since they are not supported
239241
metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_,
240242
total_compressed_size_, total_uncompressed_size_, has_dictionary,
241-
fallback, meta_encryptor_);
243+
fallback, dict_encoding_stats_, data_encoding_stats_,
244+
meta_encryptor_);
242245
// Write metadata at end of column chunk
243246
metadata_->WriteTo(sink_.get());
244247
}
@@ -310,7 +313,7 @@ class SerializedPageWriter : public PageWriter {
310313
total_uncompressed_size_ += uncompressed_size + header_size;
311314
total_compressed_size_ += output_data_len + header_size;
312315
num_values_ += page.num_values();
313-
316+
++data_encoding_stats_[page.encoding()];
314317
++page_ordinal_;
315318
PARQUET_ASSIGN_OR_THROW(int64_t current_pos, sink_->Tell());
316319
return current_pos - start_pos;
@@ -405,6 +408,9 @@ class SerializedPageWriter : public PageWriter {
405408
std::shared_ptr<Encryptor> data_encryptor_;
406409

407410
std::shared_ptr<ResizableBuffer> encryption_buffer_;
411+
412+
std::map<Encoding::type, int32_t> dict_encoding_stats_;
413+
std::map<Encoding::type, int32_t> data_encoding_stats_;
408414
};
409415

410416
// This implementation of the PageWriter writes to the final sink on Close .
@@ -441,7 +447,8 @@ class BufferedPageWriter : public PageWriter {
441447
metadata_->Finish(pager_->num_values(), dictionary_page_offset, -1,
442448
pager_->data_page_offset() + final_position,
443449
pager_->total_compressed_size(), pager_->total_uncompressed_size(),
444-
has_dictionary, fallback, pager_->meta_encryptor_);
450+
has_dictionary, fallback, pager_->dict_encoding_stats_,
451+
pager_->data_encoding_stats_, pager_->meta_encryptor_);
445452

446453
// Write metadata at end of column chunk
447454
metadata_->WriteTo(in_memory_sink_.get());

cpp/src/parquet/column_writer_test.cc

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,31 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
185185
{Encoding::RLE_DICTIONARY, Encoding::PLAIN, Encoding::RLE, Encoding::PLAIN});
186186
ASSERT_EQ(encodings, expected);
187187
}
188+
189+
std::vector<parquet::PageEncodingStats> encoding_stats =
190+
this->metadata_encoding_stats();
191+
if (this->type_num() == Type::BOOLEAN) {
192+
ASSERT_EQ(encoding_stats[0].encoding, Encoding::PLAIN);
193+
ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE);
194+
} else if (version == ParquetVersion::PARQUET_1_0) {
195+
std::vector<Encoding::type> expected(
196+
{Encoding::PLAIN_DICTIONARY, Encoding::PLAIN, Encoding::PLAIN_DICTIONARY});
197+
ASSERT_EQ(encoding_stats[0].encoding, expected[0]);
198+
ASSERT_EQ(encoding_stats[0].page_type, PageType::DICTIONARY_PAGE);
199+
for (size_t i = 1; i < encoding_stats.size(); i++) {
200+
ASSERT_EQ(encoding_stats[i].encoding, expected[i]);
201+
ASSERT_EQ(encoding_stats[i].page_type, PageType::DATA_PAGE);
202+
}
203+
} else {
204+
std::vector<Encoding::type> expected(
205+
{Encoding::PLAIN, Encoding::PLAIN, Encoding::RLE_DICTIONARY});
206+
ASSERT_EQ(encoding_stats[0].encoding, expected[0]);
207+
ASSERT_EQ(encoding_stats[0].page_type, PageType::DICTIONARY_PAGE);
208+
for (size_t i = 1; i < encoding_stats.size(); i++) {
209+
ASSERT_EQ(encoding_stats[i].encoding, expected[i]);
210+
ASSERT_EQ(encoding_stats[i].page_type, PageType::DATA_PAGE);
211+
}
212+
}
188213
}
189214

190215
void WriteRequiredWithSettings(Encoding::type encoding, Compression::type compression,
@@ -273,6 +298,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
273298
return metadata_accessor->encodings();
274299
}
275300

301+
std::vector<parquet::PageEncodingStats> metadata_encoding_stats() {
302+
// Metadata accessor must be created lazily.
303+
// This is because the ColumnChunkMetaData semantics dictate the metadata object is
304+
// complete (no changes to the metadata buffer can be made after instantiation)
305+
auto metadata_accessor =
306+
ColumnChunkMetaData::Make(metadata_->contents(), this->descr_);
307+
return metadata_accessor->encoding_stats();
308+
}
309+
276310
protected:
277311
int64_t values_read_;
278312
// Keep the reader alive as for ByteArray the lifetime of the ByteArray

cpp/src/parquet/metadata.cc

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
214214
for (auto encoding : column_metadata_->encodings) {
215215
encodings_.push_back(FromThrift(encoding));
216216
}
217+
for (auto encoding_stats : column_metadata_->encoding_stats) {
218+
encoding_stats_.push_back({FromThrift(encoding_stats.page_type),
219+
FromThrift(encoding_stats.encoding),
220+
encoding_stats.count});
221+
}
217222
possible_stats_ = nullptr;
218223
}
219224
// column chunk
@@ -257,6 +262,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
257262

258263
const std::vector<Encoding::type>& encodings() const { return encodings_; }
259264

265+
const std::vector<PageEncodingStats>& encoding_stats() const { return encoding_stats_; }
266+
260267
inline bool has_dictionary_page() const {
261268
return column_metadata_->__isset.dictionary_page_offset;
262269
}
@@ -293,6 +300,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
293300
private:
294301
mutable std::shared_ptr<Statistics> possible_stats_;
295302
std::vector<Encoding::type> encodings_;
303+
std::vector<PageEncodingStats> encoding_stats_;
296304
const format::ColumnChunk* column_;
297305
const format::ColumnMetaData* column_metadata_;
298306
format::ColumnMetaData decrypted_metadata_;
@@ -367,6 +375,10 @@ const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
367375
return impl_->encodings();
368376
}
369377

378+
const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats() const {
379+
return impl_->encoding_stats();
380+
}
381+
370382
int64_t ColumnChunkMetaData::total_uncompressed_size() const {
371383
return impl_->total_uncompressed_size();
372384
}
@@ -966,7 +978,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
966978
void Finish(int64_t num_values, int64_t dictionary_page_offset,
967979
int64_t index_page_offset, int64_t data_page_offset,
968980
int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
969-
bool dictionary_fallback, const std::shared_ptr<Encryptor>& encryptor) {
981+
bool dictionary_fallback,
982+
const std::map<Encoding::type, int32_t>& dict_encoding_stats,
983+
const std::map<Encoding::type, int32_t>& data_encoding_stats,
984+
const std::shared_ptr<Encryptor>& encryptor) {
970985
if (dictionary_page_offset > 0) {
971986
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
972987
column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
@@ -1000,6 +1015,24 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
10001015
thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
10011016
}
10021017
column_chunk_->meta_data.__set_encodings(thrift_encodings);
1018+
std::vector<format::PageEncodingStats> thrift_encoding_stats;
1019+
// Add dictionary page encoding stats
1020+
for (const auto& entry : dict_encoding_stats) {
1021+
format::PageEncodingStats dict_enc_stat;
1022+
dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
1023+
dict_enc_stat.__set_encoding(ToThrift(entry.first));
1024+
dict_enc_stat.__set_count(entry.second);
1025+
thrift_encoding_stats.push_back(dict_enc_stat);
1026+
}
1027+
// Add data page encoding stats
1028+
for (const auto& entry : data_encoding_stats) {
1029+
format::PageEncodingStats data_enc_stat;
1030+
data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
1031+
data_enc_stat.__set_encoding(ToThrift(entry.first));
1032+
data_enc_stat.__set_count(entry.second);
1033+
thrift_encoding_stats.push_back(data_enc_stat);
1034+
}
1035+
column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
10031036

10041037
const auto& encrypt_md =
10051038
properties_->column_encryption_properties(column_->path()->ToDotString());
@@ -1117,16 +1150,16 @@ void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
11171150
impl_->set_file_path(path);
11181151
}
11191152

1120-
void ColumnChunkMetaDataBuilder::Finish(int64_t num_values,
1121-
int64_t dictionary_page_offset,
1122-
int64_t index_page_offset,
1123-
int64_t data_page_offset, int64_t compressed_size,
1124-
int64_t uncompressed_size, bool has_dictionary,
1125-
bool dictionary_fallback,
1126-
const std::shared_ptr<Encryptor>& encryptor) {
1153+
void ColumnChunkMetaDataBuilder::Finish(
1154+
int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset,
1155+
int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size,
1156+
bool has_dictionary, bool dictionary_fallback,
1157+
const std::map<Encoding::type, int32_t>& dict_encoding_stats,
1158+
const std::map<Encoding::type, int32_t>& data_encoding_stats,
1159+
const std::shared_ptr<Encryptor>& encryptor) {
11271160
impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
11281161
compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
1129-
encryptor);
1162+
dict_encoding_stats, data_encoding_stats, encryptor);
11301163
}
11311164

11321165
void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) {

cpp/src/parquet/metadata.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <map>
2222
#include <memory>
2323
#include <string>
24+
#include <utility>
2425
#include <vector>
2526

2627
#include "arrow/util/key_value_metadata.h"
@@ -119,6 +120,13 @@ class PARQUET_EXPORT ColumnCryptoMetaData {
119120
std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
120121
};
121122

123+
/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
124+
struct PageEncodingStats {
125+
PageType::type page_type;
126+
Encoding::type encoding;
127+
int32_t count;
128+
};
129+
122130
class PARQUET_EXPORT ColumnChunkMetaData {
123131
public:
124132
// API convenience to get a MetaData accessor
@@ -150,6 +158,7 @@ class PARQUET_EXPORT ColumnChunkMetaData {
150158
bool can_decompress() const;
151159

152160
const std::vector<Encoding::type>& encodings() const;
161+
const std::vector<PageEncodingStats>& encoding_stats() const;
153162
bool has_dictionary_page() const;
154163
int64_t dictionary_page_offset() const;
155164
int64_t data_page_offset() const;
@@ -320,6 +329,8 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
320329
int64_t index_page_offset, int64_t data_page_offset,
321330
int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
322331
bool dictionary_fallback,
332+
const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
333+
const std::map<Encoding::type, int32_t>& data_encoding_stats_,
323334
const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
324335

325336
// The metadata contents, suitable for passing to ColumnChunkMetaData::Make

cpp/src/parquet/metadata_test.cc

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,22 @@ std::unique_ptr<parquet::FileMetaData> GenerateTableMetaData(
3535
EncodedStatistics stats_int, EncodedStatistics stats_float) {
3636
auto f_builder = FileMetaDataBuilder::Make(&schema, props);
3737
auto rg1_builder = f_builder->AppendRowGroup();
38-
3938
// Write the metadata
4039
// rowgroup1 metadata
4140
auto col1_builder = rg1_builder->NextColumnChunk();
4241
auto col2_builder = rg1_builder->NextColumnChunk();
4342
// column metadata
43+
std::map<Encoding::type, int32_t> dict_encoding_stats({{Encoding::RLE_DICTIONARY, 1}});
44+
std::map<Encoding::type, int32_t> data_encoding_stats(
45+
{{Encoding::PLAIN, 1}, {Encoding::RLE, 1}});
4446
stats_int.set_is_signed(true);
4547
col1_builder->SetStatistics(stats_int);
4648
stats_float.set_is_signed(true);
4749
col2_builder->SetStatistics(stats_float);
48-
col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false);
49-
col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false);
50+
col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false, dict_encoding_stats,
51+
data_encoding_stats);
52+
col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false, dict_encoding_stats,
53+
data_encoding_stats);
5054

5155
rg1_builder->set_num_rows(nrows / 2);
5256
rg1_builder->Finish(1024);
@@ -58,8 +62,10 @@ std::unique_ptr<parquet::FileMetaData> GenerateTableMetaData(
5862
// column metadata
5963
col1_builder->SetStatistics(stats_int);
6064
col2_builder->SetStatistics(stats_float);
61-
col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false);
62-
col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false);
65+
col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false, dict_encoding_stats,
66+
data_encoding_stats);
67+
col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false, dict_encoding_stats,
68+
data_encoding_stats);
6369

6470
rg2_builder->set_num_rows(nrows / 2);
6571
rg2_builder->Finish(1024);
@@ -155,6 +161,8 @@ TEST(Metadata, TestBuildAccess) {
155161
ASSERT_EQ(24, rg1_column2->dictionary_page_offset());
156162
ASSERT_EQ(10, rg1_column1->data_page_offset());
157163
ASSERT_EQ(30, rg1_column2->data_page_offset());
164+
ASSERT_EQ(3, rg1_column1->encoding_stats().size());
165+
ASSERT_EQ(3, rg1_column2->encoding_stats().size());
158166

159167
auto rg2_accessor = f_accessors[loop_index]->RowGroup(1);
160168
ASSERT_EQ(2, rg2_accessor->num_columns());
@@ -187,6 +195,8 @@ TEST(Metadata, TestBuildAccess) {
187195
ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
188196
ASSERT_EQ(10, rg2_column1->data_page_offset());
189197
ASSERT_EQ(26, rg2_column2->data_page_offset());
198+
ASSERT_EQ(3, rg2_column1->encoding_stats().size());
199+
ASSERT_EQ(3, rg2_column2->encoding_stats().size());
190200

191201
// Test FileMetaData::set_file_path
192202
ASSERT_TRUE(rg2_column1->file_path().empty());

cpp/src/parquet/thrift_internal.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ static inline Encoding::type FromThrift(format::Encoding::type type) {
8282
return static_cast<Encoding::type>(type);
8383
}
8484

85+
static inline PageType::type FromThrift(format::PageType::type type) {
86+
return static_cast<PageType::type>(type);
87+
}
88+
8589
static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
8690
return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
8791
aesGcmV1.supply_aad_prefix};

0 commit comments

Comments
 (0)