diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index ecd104327f..52e1458fb6 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -189,7 +189,10 @@ public > Boolean visit(NotEq notEq) { try { Set dictSet = expandDictionary(meta); - if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value)) { + boolean mayContainNull = (meta.getStatistics() == null + || !meta.getStatistics().isNumNullsSet() + || meta.getStatistics().getNumNulls() > 0); + if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value) && !mayContainNull) { return BLOCK_CANNOT_MATCH; } } catch (IOException e) { diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java index 39db6d4be8..6af4437249 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java @@ -88,6 +88,7 @@ public class DictionaryFilterTest { "message test { " + "required binary binary_field; " + "required binary single_value_field; " + + "optional binary optional_single_value_field; " + "required fixed_len_byte_array(17) fixed_field (DECIMAL(40,4)); " + "required int32 int32_field; " + "required int64 int64_field; " @@ -165,6 +166,11 @@ private static void writeData(SimpleGroupFactory f, ParquetWriter writer) ALPHABET.substring(index, index+1) : UUID.randomUUID().toString()) .append("int96_field", INT96_VALUES[i % INT96_VALUES.length]); + // 10% of the time, leave the field null + if (index % 10 > 0) { + group.append("optional_single_value_field", "sharp"); + } + writer.write(group); } writer.close(); @@ -256,7 +262,7 @@ public void testDictionaryEncodedColumns() throws Exception { @SuppressWarnings("deprecation") private void testDictionaryEncodedColumnsV1() throws Exception { Set dictionaryEncodedColumns = new HashSet(Arrays.asList( - "binary_field", "single_value_field", "int32_field", "int64_field", + "binary_field", "single_value_field", "optional_single_value_field", "int32_field", "int64_field", "double_field", "float_field", "int96_field")); for (ColumnChunkMetaData column : ccmd) { String name = column.getPath().toDotString(); @@ -281,7 +287,7 @@ private void testDictionaryEncodedColumnsV1() throws Exception { private void testDictionaryEncodedColumnsV2() throws Exception { Set dictionaryEncodedColumns = new HashSet(Arrays.asList( - "binary_field", "single_value_field", "fixed_field", "int32_field", + "binary_field", "single_value_field", "optional_single_value_field", "fixed_field", "int32_field", "int64_field", "double_field", "float_field", "int96_field")); for (ColumnChunkMetaData column : ccmd) { EncodingStats encStats = column.getEncodingStats(); @@ -355,6 +361,7 @@ public void testEqInt96() throws Exception { @Test public void testNotEqBinary() throws Exception { BinaryColumn sharp = binaryColumn("single_value_field"); + BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field"); BinaryColumn b = binaryColumn("binary_field"); assertTrue("Should drop block with only the excluded value", @@ -363,6 +370,12 @@ public void testNotEqBinary() throws Exception { assertFalse("Should not drop block with any other value", canDrop(notEq(sharp, Binary.fromString("applause")), ccmd, dictionaries)); + assertFalse("Should not drop block with only the excluded value and null", + canDrop(notEq(sharpAndNull, Binary.fromString("sharp")), ccmd, dictionaries)); + + assertFalse("Should not drop block with any other value", + canDrop(notEq(sharpAndNull, Binary.fromString("applause")), ccmd, dictionaries)); + assertFalse("Should not drop block with a known value", canDrop(notEq(b, Binary.fromString("x")), ccmd, dictionaries));