From 37f0ea80b9ec23bb2035433c289a32cb25d1cc0f Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 17 Dec 2025 20:30:27 +0100 Subject: [PATCH 1/2] Allow reading dictionary encoded boolean I've observed some Parquet files in the wild that contain dictionary encoded boolean values, which is also wild. I don't think we want to allow producing this, but I think it would be good to allow reading this. We don't judge. --- .../org/apache/parquet/column/Encoding.java | 3 + .../dictionary/PlainValuesDictionary.java | 43 ++++++++++++ .../values/dictionary/TestDictionary.java | 69 ++++++++++++++++++- 3 files changed, 114 insertions(+), 1 deletion(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java index cadf8f2e0e..874c99fded 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java @@ -40,6 +40,7 @@ import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary; import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainDoubleDictionary; import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainFloatDictionary; import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainIntegerDictionary; @@ -102,6 +103,8 @@ public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dic return new PlainIntegerDictionary(dictionaryPage); case FLOAT: return new PlainFloatDictionary(dictionaryPage); + case BOOLEAN: + return new PlainBooleanDictionary(dictionaryPage); default: throw new ParquetDecodingException( "Dictionary encoding not 
supported for type: " + descriptor.getType()); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java index 436bddd3c1..45d7390e19 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java @@ -28,6 +28,7 @@ import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.plain.BooleanPlainValuesReader; import org.apache.parquet.column.values.plain.PlainValuesReader.DoublePlainValuesReader; import org.apache.parquet.column.values.plain.PlainValuesReader.FloatPlainValuesReader; import org.apache.parquet.column.values.plain.PlainValuesReader.IntegerPlainValuesReader; @@ -300,4 +301,46 @@ public int getMaxId() { return floatDictionaryContent.length - 1; } } + + /** + * a simple implementation of dictionary for plain encoded boolean values + */ + public static class PlainBooleanDictionary extends PlainValuesDictionary { + + private final boolean[] boolDictionaryContent; + + /** + * @param dictionaryPage a dictionary page of encoded boolean values + * @throws IOException if there is an exception while decoding the dictionary page + */ + public PlainBooleanDictionary(DictionaryPage dictionaryPage) throws IOException { + super(dictionaryPage); + ByteBufferInputStream in = dictionaryPage.getBytes().toInputStream(); + boolDictionaryContent = new boolean[dictionaryPage.getDictionarySize()]; + BooleanPlainValuesReader boolReader = new BooleanPlainValuesReader(); + boolReader.initFromPage(dictionaryPage.getDictionarySize(), in); + for (int i = 0; i < boolDictionaryContent.length; i++) { + boolDictionaryContent[i] = 
boolReader.readBoolean(); + } + } + + @Override + public boolean decodeToBoolean(int id) { + return boolDictionaryContent[id]; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("PlainIntegerDictionary {\n"); + for (int i = 0; i < boolDictionaryContent.length; i++) { + sb.append(i).append(" => ").append(boolDictionaryContent[i]).append("\n"); + } + return sb.append("}").toString(); + } + + @Override + public int getMaxId() { + return boolDictionaryContent.length - 1; + } + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index 6f7116bc36..fc6fae02ef 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -24,7 +24,7 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.*; import java.io.IOException; import java.nio.ByteBuffer; @@ -44,6 +44,7 @@ import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter; import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter; import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary; import org.apache.parquet.column.values.fallback.FallbackValuesWriter; import org.apache.parquet.column.values.plain.BinaryPlainValuesReader; import org.apache.parquet.column.values.plain.PlainValuesReader; @@ 
-678,6 +679,72 @@ public void testZeroValues() throws IOException { } } + @Test + public void testBooleanDictionary() throws IOException { + // Create a dictionary page with boolean values (false, true) + // Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02 + BytesInput bytes = BytesInput.from(new byte[] {0x02}); + DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN); + + PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage); + + // Verify dictionary decoding + assertFalse(dictionary.decodeToBoolean(0)); + assertTrue(dictionary.decodeToBoolean(1)); + assertEquals(1, dictionary.getMaxId()); + } + + @Test + public void testBooleanDictionarySingleValue() throws IOException { + // Test dictionary with only true value + // Bit-packed: bit 0 = true (1) => byte = 0b00000001 = 0x01 + BytesInput bytesTrue = BytesInput.from(new byte[] {0x01}); + DictionaryPage dictionaryPageTrue = new DictionaryPage(bytesTrue, 1, PLAIN); + + PlainBooleanDictionary dictionaryTrue = new PlainBooleanDictionary(dictionaryPageTrue); + + assertTrue(dictionaryTrue.decodeToBoolean(0)); + assertEquals(0, dictionaryTrue.getMaxId()); + + // Test dictionary with only false value + // Bit-packed: bit 0 = false (0) => byte = 0b00000000 = 0x00 + BytesInput bytesFalse = BytesInput.from(new byte[] {0x00}); + DictionaryPage dictionaryPageFalse = new DictionaryPage(bytesFalse, 1, PLAIN); + + PlainBooleanDictionary dictionaryFalse = new PlainBooleanDictionary(dictionaryPageFalse); + + assertFalse(dictionaryFalse.decodeToBoolean(0)); + assertEquals(0, dictionaryFalse.getMaxId()); + } + + @Test + public void testBooleanDictionaryToString() throws IOException { + // Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02 + BytesInput bytes = BytesInput.from(new byte[] {0x02}); + DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN); + + PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage); 
+ + String str = dictionary.toString(); + Assert.assertTrue(str.contains("PlainIntegerDictionary")); + Assert.assertTrue(str.contains("0 => false")); + Assert.assertTrue(str.contains("1 => true")); + } + + @Test + public void testBooleanDictionaryWithDictionaryEncoding() throws IOException { + // Test with PLAIN_DICTIONARY encoding (both PLAIN and PLAIN_DICTIONARY should work) + // Bit-packed: bit 0 = true (1), bit 1 = false (0) => byte = 0b00000001 = 0x01 + BytesInput bytes = BytesInput.from(new byte[] {0x01}); + DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN_DICTIONARY); + + PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage); + + assertEquals(true, dictionary.decodeToBoolean(0)); + assertEquals(false, dictionary.decodeToBoolean(1)); + assertEquals(1, dictionary.getMaxId()); + } + private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type) throws IOException { final DictionaryPage dictionaryPage = cw.toDictPageAndClose().copy(); final ColumnDescriptor descriptor = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0); From 26f30d09dd5d7bbb52454fc738558a8d31a79bf7 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 18 Dec 2025 17:35:08 +0100 Subject: [PATCH 2/2] Thanks Gang --- .../column/values/dictionary/PlainValuesDictionary.java | 2 +- .../parquet/column/values/dictionary/TestDictionary.java | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java index 45d7390e19..468c7d110f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java @@ -331,7 +331,7 @@ public boolean decodeToBoolean(int id) { @Override public 
String toString() { - StringBuilder sb = new StringBuilder("PlainIntegerDictionary {\n"); + StringBuilder sb = new StringBuilder("PlainBooleanDictionary {\n"); for (int i = 0; i < boolDictionaryContent.length; i++) { sb.append(i).append(" => ").append(boolDictionaryContent[i]).append("\n"); } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index fc6fae02ef..a91f807e73 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -24,7 +24,9 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import java.io.IOException; import java.nio.ByteBuffer; @@ -726,7 +728,7 @@ public void testBooleanDictionaryToString() throws IOException { PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage); String str = dictionary.toString(); - Assert.assertTrue(str.contains("PlainIntegerDictionary")); + Assert.assertTrue(str.contains("PlainBooleanDictionary")); Assert.assertTrue(str.contains("0 => false")); Assert.assertTrue(str.contains("1 => true")); }