From 09063f4a48c6d1441787edd8ec7984d136f7b5ac Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 12 Sep 2018 14:28:40 +0200 Subject: [PATCH 1/2] PARQUET-1365: Don't write page level statistics --- .../format/converter/ParquetMetadataConverter.java | 13 +------------ .../hadoop/TestColumnChunkPageWriteStore.java | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 58ae5039c4..dadd7c3c12 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -1334,7 +1334,6 @@ public void writeDataPageHeader( writePageHeader(newDataPageHeader(uncompressedSize, compressedSize, valueCount, - new org.apache.parquet.column.statistics.BooleanStatistics(), rlEncoding, dlEncoding, valuesEncoding), to); @@ -1350,7 +1349,7 @@ public void writeDataPageHeader( org.apache.parquet.column.Encoding valuesEncoding, OutputStream to) throws IOException { writePageHeader( - newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics, + newDataPageHeader(uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding), to); } @@ -1358,7 +1357,6 @@ public void writeDataPageHeader( private PageHeader newDataPageHeader( int uncompressedSize, int compressedSize, int valueCount, - org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding rlEncoding, org.apache.parquet.column.Encoding dlEncoding, org.apache.parquet.column.Encoding valuesEncoding) { @@ -1369,9 +1367,6 @@ private PageHeader newDataPageHeader( getEncoding(valuesEncoding), getEncoding(dlEncoding), getEncoding(rlEncoding))); - if (!statistics.isEmpty()) { - pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics)); - } return pageHeader; } @@ -1386,7 +1381,6 @@ public void writeDataPageV2Header( newDataPageV2Header( uncompressedSize, compressedSize, valueCount, nullCount, rowCount, - statistics, dataEncoding, rlByteLength, dlByteLength), to); } @@ -1394,7 +1388,6 @@ public void writeDataPageV2Header( private PageHeader newDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, - org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding dataEncoding, int rlByteLength, int dlByteLength) { // TODO: pageHeader.crc = ...; @@ -1402,10 +1395,6 @@ private PageHeader newDataPageV2Header( valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength); - if (!statistics.isEmpty()) { - dataPageHeaderV2.setStatistics( - toParquetStatistics(statistics)); - } PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); pageHeader.setData_page_header_v2(dataPageHeaderV2); return pageHeader; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index 9a27defe15..c353ee3fe7 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -189,7 +189,6 @@ public void test() throws Exception { assertEquals(r, intValue(page.getRepetitionLevels())); assertEquals(dataEncoding, page.getDataEncoding()); assertEquals(v, intValue(page.getData())); - assertEquals(statistics.toString(), page.getStatistics().toString()); // Checking column/offset indexes for the one page ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0); From 1a736085b931b89290d8e59a84ac46495ca64c90 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Fri, 19 Oct 2018 08:40:34 +0200 Subject: [PATCH 2/2] PARQUET-1365: Remove statistics from methods --- .../converter/ParquetMetadataConverter.java | 34 +++++++++++++++++++ .../hadoop/ColumnChunkPageWriteStore.java | 4 +-- .../parquet/hadoop/ParquetFileWriter.java | 5 ++- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index dadd7c3c12..b9c8996f0f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -1339,6 +1339,8 @@ public void writeDataPageHeader( valuesEncoding), to); } + // Statistics are no longer saved in page headers + @Deprecated public void writeDataPageHeader( int uncompressedSize, int compressedSize, @@ -1370,6 +1372,8 @@ private PageHeader newDataPageHeader( return pageHeader; } + // Statistics are no longer saved in page headers + @Deprecated public void writeDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, @@ -1385,6 +1389,36 @@ public void writeDataPageV2Header( rlByteLength, dlByteLength), to); } + public void writeDataPageV1Header( + int uncompressedSize, + int compressedSize, + int valueCount, + org.apache.parquet.column.Encoding rlEncoding, + org.apache.parquet.column.Encoding dlEncoding, + org.apache.parquet.column.Encoding valuesEncoding, + OutputStream to) throws IOException { + writePageHeader(newDataPageHeader(uncompressedSize, + compressedSize, + valueCount, + rlEncoding, + dlEncoding, + valuesEncoding), to); + } + + public void writeDataPageV2Header( + int uncompressedSize, int compressedSize, + int valueCount, int nullCount, int rowCount, + org.apache.parquet.column.Encoding dataEncoding, + int rlByteLength, int dlByteLength, + OutputStream to) throws IOException { + writePageHeader( + newDataPageV2Header( + uncompressedSize, compressedSize, + valueCount, nullCount, rowCount, + dataEncoding, + rlByteLength, dlByteLength), to); + } + private PageHeader newDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 85bdbdbd9b..f87630bf24 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -119,11 +119,10 @@ public void writePage(BytesInput bytes, + compressedSize); } tempOutputStream.reset(); - parquetMetadataConverter.writeDataPageHeader( + parquetMetadataConverter.writeDataPageV1Header( (int)uncompressedSize, (int)compressedSize, valueCount, - statistics, rlEncoding, dlEncoding, valuesEncoding, @@ -171,7 +170,6 @@ public void writePageV2( parquetMetadataConverter.writeDataPageV2Header( uncompressedSize, compressedSize, valueCount, nullCount, rowCount, - statistics, dataEncoding, rlByteLength, dlByteLength, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index a8cd686022..20efe47573 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -433,7 +433,7 @@ public void writeDataPage( long beforeHeader = out.getPos(); LOG.debug("{}: write data page: {} values", beforeHeader, valueCount); int compressedPageSize = (int)bytes.size(); - metadataConverter.writeDataPageHeader( + metadataConverter.writeDataPageV1Header( uncompressedPageSize, compressedPageSize, valueCount, rlEncoding, @@ -518,10 +518,9 @@ private void innerWriteDataPage( } LOG.debug("{}: write data page: {} values", beforeHeader, valueCount); int compressedPageSize = (int) bytes.size(); - metadataConverter.writeDataPageHeader( + metadataConverter.writeDataPageV1Header( uncompressedPageSize, compressedPageSize, valueCount, - statistics, rlEncoding, dlEncoding, valuesEncoding,