From 5fd0279776775230380f2dd01d252aac6d5ffb15 Mon Sep 17 00:00:00 2001 From: Swapnil Shinde Date: Wed, 22 Feb 2017 15:19:38 -0500 Subject: [PATCH 1/2] Parquet-196: parquet-tools command for row count & size --- .../parquet/tools/command/Registry.java | 2 + .../tools/command/RowCountCommand.java | 97 ++++++++++++ .../parquet/tools/command/SizeCommand.java | 140 ++++++++++++++++++ .../src/main/scripts/parquet-rowcount | 28 ++++ parquet-tools/src/main/scripts/parquet-size | 28 ++++ 5 files changed, 295 insertions(+) create mode 100644 parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java create mode 100644 parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java create mode 100644 parquet-tools/src/main/scripts/parquet-rowcount create mode 100644 parquet-tools/src/main/scripts/parquet-size diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java index 0e69f481c7..6df84be37a 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java @@ -32,6 +32,8 @@ public final class Registry { registry.put("meta", ShowMetaCommand.class); registry.put("dump", DumpCommand.class); registry.put("merge", MergeCommand.class); + registry.put("rowcount", RowCountCommand.class); + registry.put("size", SizeCommand.class); } public static Map allCommands() { diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java new file mode 100644 index 0000000000..de84725623 --- /dev/null +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.tools.command; + +import java.io.PrintWriter; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; + +import org.apache.parquet.hadoop.Footer; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.tools.Main; + +public class RowCountCommand extends ArgsOnlyCommand { + private FileStatus[] inputFileStatuses; + private Configuration conf; + private Path inputPath; + private PrintWriter out; + public static final String[] USAGE = new String[] { + "<input>", + "where <input> is the parquet file to count rows to stdout" + }; + + public static final Options OPTIONS; + static { + OPTIONS = new Options(); + Option detailed = OptionBuilder.withLongOpt("detailed") + .withDescription("Detailed rowcount of each matching file") + .create('d'); + OPTIONS.addOption(detailed); + } + + public RowCountCommand() { + super(1, 1); + } + + @Override + public Options getOptions() { + return OPTIONS; + } + + @Override + public String[] getUsageDescription() { + 
return USAGE; + } + + @Override + public void execute(CommandLine options) throws Exception { + super.execute(options); + + String[] args = options.getArgs(); + String input = args[0]; + out = new PrintWriter(Main.out, true); + inputPath = new Path(input); + conf = new Configuration(); + inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath); + long rowCount = 0; + + for(FileStatus fs : inputFileStatuses){ + long fileRowCount=0; + for(Footer f : ParquetFileReader.readFooters(conf, fs, false)){ + for(BlockMetaData b : f.getParquetMetadata().getBlocks()){ + rowCount += b.getRowCount(); + fileRowCount += b.getRowCount(); + } + } + if(options.hasOption('d')){ + out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount); + } + } + + out.format("Total RowCount: %d", rowCount); + out.println(); + } +} diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java new file mode 100644 index 0000000000..6fed268127 --- /dev/null +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.tools.command; + +import java.io.PrintWriter; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; + +import org.apache.parquet.hadoop.Footer; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.tools.Main; + +public class SizeCommand extends ArgsOnlyCommand { + private FileStatus[] inputFileStatuses; + private Configuration conf; + private Path inputPath; + private PrintWriter out; + private static final double ONE_KB = 1024; + private static final double ONE_MB = ONE_KB * 1024; + private static final double ONE_GB = ONE_MB * 1024; + private static final double ONE_TB = ONE_GB * 1024; + private static final double ONE_PB = ONE_TB * 1024; + + public static final String[] USAGE = new String[] { + "<input>", + "where <input> is the parquet file to get size & human readable size to stdout" + }; + + public static final Options OPTIONS; + static { + OPTIONS = new Options(); + Option help = OptionBuilder.withLongOpt("pretty") + .withDescription("Pretty size") + .create('p'); + OPTIONS.addOption(help); + Option uncompressed = OptionBuilder.withLongOpt("uncompressed") + .withDescription("Uncompressed size") + .create('u'); + OPTIONS.addOption(uncompressed); + Option detailed = OptionBuilder.withLongOpt("detailed") + .withDescription("Detailed size of each matching file") + .create('d'); + OPTIONS.addOption(detailed); + } + + public SizeCommand() { + super(1, 1); + } + + @Override + public Options getOptions() { + return OPTIONS; + } + + @Override + public String[] getUsageDescription() { + return USAGE; + } + + @Override + public void execute(CommandLine options) throws Exception { + super.execute(options); + + String[] args = 
options.getArgs(); + String input = args[0]; + out = new PrintWriter(Main.out, true); + inputPath = new Path(input); + conf = new Configuration(); + inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath); + long size = 0; + for(FileStatus fs : inputFileStatuses){ + long fileSize = 0; + for(Footer f : ParquetFileReader.readFooters(conf, fs, false)){ + for(BlockMetaData b : f.getParquetMetadata().getBlocks()){ + size += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize()); + fileSize += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize()); + } + } + if(options.hasOption('d')){ + if(options.hasOption('p')){ + out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize)); + } + else{ + out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize); + } + } + } + + if(options.hasOption('p')){ + out.format("Total Size: %s", getPrettySize(size)); + } + else{ + out.format("Total Size: %d bytes", size); + } + out.println(); + } + + public String getPrettySize(long bytes){ + if (bytes/ONE_KB < 1){ + return String.format("%d", bytes) + " bytes"; + } + if (bytes/ONE_MB < 1){ + return String.format("%.3f", bytes/ONE_KB) + " KB"; + } + if (bytes/ONE_GB < 1){ + return String.format("%.3f", bytes/ONE_MB) + " MB"; + } + if (bytes/ONE_TB < 1){ + return String.format("%.3f", bytes/ONE_GB) + " GB"; + } + if (bytes/ONE_PB < 1){ + return String.format("%.3f", bytes/ONE_TB) + " TB"; + } + return String.format("%.3f", bytes/ONE_PB) + " PB"; + } +} diff --git a/parquet-tools/src/main/scripts/parquet-rowcount b/parquet-tools/src/main/scripts/parquet-rowcount new file mode 100644 index 0000000000..ab12e71f52 --- /dev/null +++ b/parquet-tools/src/main/scripts/parquet-rowcount @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# The name of the top-level script +TOPSCRIPT="parquet-tools" + +# Determine the path to the script's directory +APPPATH=$( cd "$(dirname "$0")" ; pwd -P ) + +# Run the application +exec "${APPPATH}/${TOPSCRIPT}" rowcount "$@" diff --git a/parquet-tools/src/main/scripts/parquet-size b/parquet-tools/src/main/scripts/parquet-size new file mode 100644 index 0000000000..c9048b0127 --- /dev/null +++ b/parquet-tools/src/main/scripts/parquet-size @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# The name of the top-level script +TOPSCRIPT="parquet-tools" + +# Determine the path to the script's directory +APPPATH=$( cd "$(dirname "$0")" ; pwd -P ) + +# Run the application +exec "${APPPATH}/${TOPSCRIPT}" size "$@" From 59a898089e0af99e280bb758c07dce53e8de7ade Mon Sep 17 00:00:00 2001 From: Swapnil Shinde Date: Mon, 27 Mar 2017 12:36:29 -0400 Subject: [PATCH 2/2] Spacing to conform java style (if/for) is fixed --- .../tools/command/RowCountCommand.java | 8 +++--- .../parquet/tools/command/SizeCommand.java | 26 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java index de84725623..37d6079777 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java @@ -78,15 +78,15 @@ public void execute(CommandLine options) throws Exception { inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath); long rowCount = 0; - for(FileStatus fs : inputFileStatuses){ + for (FileStatus fs : inputFileStatuses) { long fileRowCount=0; - for(Footer f : ParquetFileReader.readFooters(conf, fs, false)){ - for(BlockMetaData b : f.getParquetMetadata().getBlocks()){ + for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) { + for (BlockMetaData b : f.getParquetMetadata().getBlocks()) { rowCount += b.getRowCount(); fileRowCount += b.getRowCount(); } } - if(options.hasOption('d')){ + if (options.hasOption('d')) { out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount); } } diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java index 6fed268127..bcc67049de 100644 --- 
a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java @@ -92,47 +92,47 @@ public void execute(CommandLine options) throws Exception { conf = new Configuration(); inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath); long size = 0; - for(FileStatus fs : inputFileStatuses){ + for (FileStatus fs : inputFileStatuses) { long fileSize = 0; - for(Footer f : ParquetFileReader.readFooters(conf, fs, false)){ - for(BlockMetaData b : f.getParquetMetadata().getBlocks()){ + for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) { + for (BlockMetaData b : f.getParquetMetadata().getBlocks()) { size += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize()); fileSize += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize()); } } - if(options.hasOption('d')){ - if(options.hasOption('p')){ + if (options.hasOption('d')) { + if (options.hasOption('p')) { out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize)); } - else{ + else { out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize); } } } - if(options.hasOption('p')){ + if (options.hasOption('p')) { out.format("Total Size: %s", getPrettySize(size)); } - else{ + else { out.format("Total Size: %d bytes", size); } out.println(); } public String getPrettySize(long bytes){ - if (bytes/ONE_KB < 1){ + if (bytes/ONE_KB < 1) { return String.format("%d", bytes) + " bytes"; } - if (bytes/ONE_MB < 1){ + if (bytes/ONE_MB < 1) { return String.format("%.3f", bytes/ONE_KB) + " KB"; } - if (bytes/ONE_GB < 1){ + if (bytes/ONE_GB < 1) { return String.format("%.3f", bytes/ONE_MB) + " MB"; } - if (bytes/ONE_TB < 1){ + if (bytes/ONE_TB < 1) { return String.format("%.3f", bytes/ONE_GB) + " GB"; } - if (bytes/ONE_PB < 1){ + if (bytes/ONE_PB < 1) { return String.format("%.3f", bytes/ONE_TB) + " TB"; } return String.format("%.3f", bytes/ONE_PB) + " PB";