Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet-tools/src/main/java/parquet/tools/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ public static void die(String message, boolean usage, String name, Command comma
if (name != null && command != null) {
showUsage(name, command);
} else {
showUsage(name, command);
showUsage();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public final class Registry {
registry.put("schema", ShowSchemaCommand.class);
registry.put("meta", ShowMetaCommand.class);
registry.put("dump", DumpCommand.class);
registry.put("rowcount", RowCountCommand.class);
}

public static Map<String,Command> allCommands() {
Expand Down
121 changes: 121 additions & 0 deletions parquet-tools/src/main/java/parquet/tools/command/RowCountCommand.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/**
* Copyright 2013 ARRIS, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.tools.command;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;

import parquet.hadoop.Footer;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;
import parquet.tools.Main;
import parquet.tools.util.PrettyPrintWriter;
import parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler;

import com.google.common.base.Joiner;
import java.util.List;
import java.util.ArrayList;

/**
 * Command that reports row-count statistics for the row groups of a parquet input.
 *
 * <p>The footers of the input path are read, the row count of every row group
 * (block) listed in them is collected, and a one-line summary with the total,
 * minimum, average and maximum row count plus the number of row groups is printed.
 * With {@code -d} / {@code --detailed} the individual row counts are printed too.
 */
public class RowCountCommand extends ArgsOnlyCommand {
  public static final String[] USAGE = new String[] {
    "<input>",
    "where <input> is the parquet file containing the row counts to show"
  };

  public static final Options OPTIONS;
  static {
    OPTIONS = new Options();
    Option detail = OptionBuilder.withLongOpt("detailed")
                                 .withDescription("Show the individual row counts.")
                                 .create('d');
    OPTIONS.addOption(detail);
  }

  /** Creates the command; it accepts exactly one positional argument (the input path). */
  public RowCountCommand() {
    super(1, 1);
  }

  @Override
  public String[] getUsageDescription() {
    return USAGE;
  }

  @Override
  public Options getOptions() {
    return OPTIONS;
  }

  /**
   * Reads the footers for the input path given as the first argument and prints
   * the row-count statistics of all row groups found.
   *
   * @param options parsed command line; {@code -d} enables the detailed listing
   * @throws Exception if the input cannot be resolved or its footers cannot be read
   */
  @Override
  public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inputPath = new Path(input);
    FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
    List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

    List<Long> rowCounts = new ArrayList<Long>();
    for (Footer footer : footers) {
      processFooter(footer, rowCounts);
    }
    outputCounts(rowCounts, options.hasOption('d'));
  }

  /** Appends the row count of every row group in {@code footer} to {@code rowCounts}. */
  private void processFooter(Footer footer, List<Long> rowCounts) {
    List<BlockMetaData> footerBlocks = footer.getParquetMetadata().getBlocks();
    for (BlockMetaData bmeta : footerBlocks) {
      processBlock(bmeta, rowCounts);
    }
  }

  /** Appends the row count of a single row group to {@code rowCounts}. */
  private void processBlock(BlockMetaData blockMeta, List<Long> rowCounts) {
    // Long.valueOf instead of the deprecated Long constructor.
    rowCounts.add(Long.valueOf(blockMeta.getRowCount()));
  }

  /**
   * Prints the summary line and, when requested, the individual row counts.
   *
   * @param rowCounts row count of each row group, in the order they were read
   * @param detailed  whether to also print every individual row count
   */
  private void outputCounts(List<Long> rowCounts, boolean detailed) {
    // Primitive accumulators: avoids re-boxing a Long on every iteration.
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    long sum = 0L;
    int count = 0;

    for (long rc : rowCounts) {
      min = Math.min(min, rc);
      max = Math.max(max, rc);
      sum += rc;
      count++;
    }

    if (count == 0) {
      // No row groups: report zeros rather than leaking the Long.MAX_VALUE /
      // Long.MIN_VALUE sentinels into the output.
      min = 0L;
      max = 0L;
    }
    double avg = (count == 0) ? 0 : (sum / (double) count);

    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                             .withAutoFlush()
                                             .build();
    out.printf("Row Group Stats: [ Total:%d, Min:%d, Avg:%.2f, Max:%d, NumRowGroups:%d ]\n", sum, min, avg, max, count);
    if (detailed) {
      out.printf("Row Counts: [ %s ]\n", Joiner.on(", ").join(rowCounts));
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
import static parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
Expand All @@ -40,6 +43,20 @@ public class ShowMetaCommand extends ArgsOnlyCommand {
"where <input> is the parquet file to print to stdout"
};

public static final Options OPTIONS;
static {
  // Build the option set locally, then publish it through the constant.
  final Options opts = new Options();

  // -r / --human-readable
  opts.addOption(OptionBuilder.withLongOpt("human-readable")
                              .withDescription("Show the byte counts in human readable form.")
                              .create('r'));

  // -s / --summary
  opts.addOption(OptionBuilder.withLongOpt("summary")
                              .withDescription("Only show the summary per row group.")
                              .create('s'));

  OPTIONS = opts;
}

public ShowMetaCommand() {
super(1, 1);
}
Expand All @@ -49,13 +66,19 @@ public String[] getUsageDescription() {
return USAGE;
}

/**
 * Returns the command-line options accepted by this command.
 *
 * @return the shared {@link #OPTIONS} instance
 */
@Override
public Options getOptions() {
return OPTIONS;
}


@Override
public void execute(CommandLine options) throws Exception {
super.execute(options);

String[] args = options.getArgs();
String input = args[0];

Configuration conf = new Configuration();
Path inputPath = new Path(input);
FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
Expand All @@ -69,7 +92,7 @@ public void execute(CommandLine options) throws Exception {

for(Footer f: footers) {
out.format("file: %s%n" , f.getFile());
MetadataUtils.showDetails(out, f.getParquetMetadata());
MetadataUtils.showDetails(out, f.getParquetMetadata(), options.hasOption('r'), options.hasOption('s'));
out.flushColumns();
}
}
Expand Down
55 changes: 37 additions & 18 deletions parquet-tools/src/main/java/parquet/tools/util/MetadataUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,23 @@ public class MetadataUtils {
public static final double BAD_COMPRESSION_RATIO_CUTOFF = 0.97;
public static final double GOOD_COMPRESSION_RATIO_CUTOFF = 1.2;

public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean humanReadable, boolean summary) {
showDetails(out, meta.getFileMetaData());
out.println();

long i = 1;
for (BlockMetaData bmeta : meta.getBlocks()) {
out.println();
showDetails(out, bmeta, i++);
showDetails(out, bmeta, i++, humanReadable, summary);
if( !summary ){
out.println();
}
}
}

/**
 * Convenience overload of the four-argument {@code showDetails}: delegates with
 * both the human-readable and the summary flags set to {@code false}.
 *
 * @param out  writer the details are printed to
 * @param meta parquet metadata to describe
 */
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
showDetails(out, meta, false, false);
}

public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
out.format("creator: %s%n", meta.getCreatedBy());

Expand All @@ -76,20 +83,28 @@ public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
}

public static void showDetails(PrettyPrintWriter out, BlockMetaData meta) {
showDetails(out, meta, null);
showDetails(out, meta, null, false, false);
}

private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
long rows = meta.getRowCount();
long tbs = meta.getTotalByteSize();
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num, boolean humanReadable, boolean summary) {
long rows = meta.getRowCount();
long offset = meta.getStartingPos();

out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
out.rule('-');
showDetails(out, meta.getColumns());
String tbs = (humanReadable)
? PrettyPrintWriter.humanReadableByteCount(meta.getTotalByteSize())
: Long.toString(meta.getTotalByteSize());

out.format("row group%s: RC:%d TS:%s OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
if( !summary ){
out.rule('-');
showDetails(out, meta.getColumns(), humanReadable);
}
}

/**
 * Convenience overload that prints the column chunk details with the
 * human-readable flag set to {@code false}.
 *
 * @param out    writer the details are printed to
 * @param ccmeta column chunks to describe
 */
public static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData> ccmeta) {
showDetails(out, ccmeta, false);
}

public static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData> ccmeta, boolean humanReadable) {
Map<String,Object> chunks = new LinkedHashMap<String,Object>();
for (ColumnChunkMetaData cmeta : ccmeta) {
String[] path = cmeta.getPath().toArray();
Expand All @@ -107,29 +122,29 @@ public static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData>
current.put(path[path.length - 1], cmeta);
}

showColumnChunkDetails(out, chunks, 0);
showColumnChunkDetails(out, chunks, 0, humanReadable);
}

private static void showColumnChunkDetails(PrettyPrintWriter out, Map<String,Object> current, int depth) {
private static void showColumnChunkDetails(PrettyPrintWriter out, Map<String,Object> current, int depth, boolean humanReadable) {
for (Map.Entry<String,Object> entry : current.entrySet()) {
String name = Strings.repeat(".", depth) + entry.getKey();
Object value = entry.getValue();

if (value instanceof Map) {
out.println(name + ": ");
showColumnChunkDetails(out, (Map<String,Object>)value, depth + 1);
showColumnChunkDetails(out, (Map<String,Object>)value, depth + 1, humanReadable);
} else {
out.print(name + ": ");
showDetails(out, (ColumnChunkMetaData)value, false);
showDetails(out, (ColumnChunkMetaData)value, false, humanReadable);
}
}
}

public static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta) {
showDetails(out, meta, true);
showDetails(out, meta, true, false);
}

private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) {
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name, boolean humanReadable) {
long doff = meta.getDictionaryPageOffset();
long foff = meta.getFirstDataPageOffset();
long tsize = meta.getTotalSize();
Expand All @@ -147,7 +162,11 @@ private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta,
out.format(" %s", meta.getCodec());
out.format(" DO:%d", doff);
out.format(" FPO:%d", foff);
out.format(" SZ:%d/%d/%.2f", tsize, usize, ratio);
if( humanReadable ) {
out.format(" SZ:%s/%s/%.2f", PrettyPrintWriter.humanReadableByteCount(tsize), PrettyPrintWriter.humanReadableByteCount(usize), ratio);
} else {
out.format(" SZ:%d/%d/%.2f", tsize, usize, ratio);
}
out.format(" VC:%d", count);
if (!encodings.isEmpty()) out.format(" ENC:%s", encodings);
out.println();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,15 @@ public void toString(StringBuilder builder) {
}
}

// Adapted from: http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java
/**
 * Formats a byte count using decimal (power-of-1000) units with one fractional
 * digit, e.g. {@code 1500} becomes {@code "1.5 KB"}. Values below 1000
 * (including negative values) are printed unscaled as {@code "<n> B"}.
 *
 * <p>The unit is chosen with integer arithmetic so that the printed mantissa
 * never rounds up to {@code 1000.0}: {@code 999999} is rendered as
 * {@code "1.0 MB"}, not {@code "1000.0 KB"}.
 *
 * @param bytes number of bytes
 * @return the formatted size with its unit suffix
 */
public static String humanReadableByteCount(long bytes) {
  final long unit = 1000L;
  if (bytes < unit) return bytes + " B";

  final String prefixes = "KMGTPE";
  long scaled = bytes;
  int prefix = 0;
  // 999950 is the smallest value that "%.1f" would print as "1000.0" once
  // divided by 1000, so anything at or above it belongs to the next unit.
  // Long.MAX_VALUE needs five steps, which keeps the index inside "KMGTPE".
  while (scaled >= 999950L) {
    scaled /= unit;
    prefix++;
  }
  return String.format("%.1f %cB", scaled / 1000.0, prefixes.charAt(prefix));
}

public static final class Span {
private String span;
private final String color;
Expand Down
25 changes: 25 additions & 0 deletions parquet-tools/src/main/scripts/parquet-rowcount
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
#
# Copyright 2013 ARRIS, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# The name of the top-level script
TOPSCRIPT="parquet-tools"

# Determine the path to the script's directory.
# - "&&" (not ";") so a failed cd does not silently fall back to the caller's
#   working directory; bail out instead.
# - CDPATH is cleared for the cd so it cannot redirect the lookup or echo the
#   resolved directory into the captured output.
# - "--" protects against a script path that starts with a dash.
APPPATH=$( CDPATH= cd -- "$(dirname -- "$0")" && pwd -P ) || exit 1

# Run the application, forwarding all arguments to the "rowcount" command
exec "${APPPATH}/${TOPSCRIPT}" rowcount "$@"