Skip to content

Commit fb01048

Browse files
rdblue authored and julienledem committed
PARQUET-18: Fix all-null value pages with dict encoding.
TestDictionary#testZeroValues demonstrates the problem, where a page of all null values is decoded using the DictionaryValuesReader. Because there are no non-null values, the page values section is 0 bytes, but the DictionaryValuesReader assumes there is at least one encoded value and attempts to read a bit width. The test passes a byte array to initFromPage with the offset equal to the array's length. The fix is to detect that there are no input bytes to read. To avoid adding validity checks to the read path, this sets the internal decoder to one that will throw an exception if any reads are attempted. Author: Ryan Blue <[email protected]> Closes apache#18 from rdblue/PARQUET-18-fix-nulls-with-dictionary and squashes the following commits: 0711766 [Ryan Blue] PARQUET-18: Fix all-null value pages with dict encoding.
1 parent f6c02e2 commit fb01048

File tree

2 files changed

+28
-5
lines changed

2 files changed

+28
-5
lines changed

parquet-column/src/main/java/parquet/column/values/dictionary/DictionaryValuesReader.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
import java.io.ByteArrayInputStream;
2121
import java.io.IOException;
22-
import java.io.InputStream;
2322

2423
import parquet.Log;
2524
import parquet.bytes.BytesUtils;
@@ -51,11 +50,21 @@ public DictionaryValuesReader(Dictionary dictionary) {
5150
@Override
5251
public void initFromPage(int valueCount, byte[] page, int offset)
5352
throws IOException {
54-
if (DEBUG) LOG.debug("init from page at offset "+ offset + " for length " + (page.length - offset));
5553
this.in = new ByteArrayInputStream(page, offset, page.length - offset);
56-
int bitWidth = BytesUtils.readIntLittleEndianOnOneByte(in);
57-
if (DEBUG) LOG.debug("bit width " + bitWidth);
58-
decoder = new RunLengthBitPackingHybridDecoder(bitWidth, in);
54+
if (page.length - offset > 0) {
55+
if (DEBUG)
56+
LOG.debug("init from page at offset " + offset + " for length " + (page.length - offset));
57+
int bitWidth = BytesUtils.readIntLittleEndianOnOneByte(in);
58+
if (DEBUG) LOG.debug("bit width " + bitWidth);
59+
decoder = new RunLengthBitPackingHybridDecoder(bitWidth, in);
60+
} else {
61+
decoder = new RunLengthBitPackingHybridDecoder(1, in) {
62+
@Override
63+
public int readInt() throws IOException {
64+
throw new IOException("Attempt to read from empty page");
65+
}
66+
};
67+
}
5968
}
6069

6170
@Override

parquet-column/src/test/java/parquet/column/values/dictionary/TestDictionary.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,20 @@ public void testFloatDictionaryFallBack() throws IOException {
417417
roundTripFloat(cw, reader, maxDictionaryByteSize);
418418
}
419419

420+
@Test
421+
public void testZeroValues() throws IOException {
422+
DictionaryValuesWriter cw = new PlainIntegerDictionaryValuesWriter(100, 100);
423+
cw.writeInteger(34);
424+
cw.writeInteger(34);
425+
getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
426+
DictionaryValuesReader reader = initDicReader(cw, INT32);
427+
428+
// pretend there are 100 nulls. what matters is offset = bytes.length.
429+
byte[] bytes = {0x00, 0x01, 0x02, 0x03}; // data doesn't matter
430+
int offset = bytes.length;
431+
reader.initFromPage(100, bytes, offset);
432+
}
433+
420434
private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type)
421435
throws IOException {
422436
final DictionaryPage dictionaryPage = cw.createDictionaryPage().copy();

0 commit comments

Comments
 (0)