From d095649b4c2dd24ea0cc1cf8cdc97df821e27c66 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 25 Jan 2019 11:08:49 -0800 Subject: [PATCH] PARQUET-1510: Fix notEq for optional columns with null values. Dictionaries cannot contain null values, so notEq filters cannot conclude that a block cannot match using only the dictionary. Instead, it must also check whether the block may have at least one null value. If there are no null values, then the existing check is correct. --- .../dictionarylevel/DictionaryFilter.java | 5 ++++- .../dictionarylevel/DictionaryFilterTest.java | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index ecd104327f..52e1458fb6 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -189,7 +189,10 @@ public > Boolean visit(NotEq notEq) { try { Set dictSet = expandDictionary(meta); - if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value)) { + boolean mayContainNull = (meta.getStatistics() == null + || !meta.getStatistics().isNumNullsSet() + || meta.getStatistics().getNumNulls() > 0); + if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value) && !mayContainNull) { return BLOCK_CANNOT_MATCH; } } catch (IOException e) { diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java index 39db6d4be8..6af4437249 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java @@ -88,6 +88,7 @@ public class DictionaryFilterTest { "message test { " + "required binary binary_field; " + "required binary single_value_field; " + + "optional binary optional_single_value_field; " + "required fixed_len_byte_array(17) fixed_field (DECIMAL(40,4)); " + "required int32 int32_field; " + "required int64 int64_field; " @@ -165,6 +166,11 @@ private static void writeData(SimpleGroupFactory f, ParquetWriter writer) ALPHABET.substring(index, index+1) : UUID.randomUUID().toString()) .append("int96_field", INT96_VALUES[i % INT96_VALUES.length]); + // 10% of the time, leave the field null + if (index % 10 > 0) { + group.append("optional_single_value_field", "sharp"); + } + writer.write(group); } writer.close(); @@ -256,7 +262,7 @@ public void testDictionaryEncodedColumns() throws Exception { @SuppressWarnings("deprecation") private void testDictionaryEncodedColumnsV1() throws Exception { Set dictionaryEncodedColumns = new HashSet(Arrays.asList( - "binary_field", "single_value_field", "int32_field", "int64_field", + "binary_field", "single_value_field", "optional_single_value_field", "int32_field", "int64_field", "double_field", "float_field", "int96_field")); for (ColumnChunkMetaData column : ccmd) { String name = column.getPath().toDotString(); @@ -281,7 +287,7 @@ private void testDictionaryEncodedColumnsV1() throws Exception { private void testDictionaryEncodedColumnsV2() throws Exception { Set dictionaryEncodedColumns = new HashSet(Arrays.asList( - "binary_field", "single_value_field", "fixed_field", "int32_field", + "binary_field", "single_value_field", "optional_single_value_field", "fixed_field", "int32_field", "int64_field", "double_field", "float_field", "int96_field")); for (ColumnChunkMetaData column : ccmd) { EncodingStats encStats = column.getEncodingStats(); @@ -355,6 +361,7 @@ public void testEqInt96() throws Exception { @Test public void testNotEqBinary() throws Exception { BinaryColumn sharp = binaryColumn("single_value_field"); + BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field"); BinaryColumn b = binaryColumn("binary_field"); assertTrue("Should drop block with only the excluded value", @@ -363,6 +370,12 @@ public void testNotEqBinary() throws Exception { assertFalse("Should not drop block with any other value", canDrop(notEq(sharp, Binary.fromString("applause")), ccmd, dictionaries)); + assertFalse("Should not drop block with only the excluded value and null", + canDrop(notEq(sharpAndNull, Binary.fromString("sharp")), ccmd, dictionaries)); + + assertFalse("Should not drop block with any other value", + canDrop(notEq(sharpAndNull, Binary.fromString("applause")), ccmd, dictionaries)); + assertFalse("Should not drop block with a known value", canDrop(notEq(b, Binary.fromString("x")), ccmd, dictionaries));