diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java index 6de1c7badc..6694fb8ee5 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java @@ -24,8 +24,11 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -56,6 +59,18 @@ public ColumnSizeCommand(Logger console) { required = false) List<String> columns; + @Parameter( + names = {"-s", "--sort"}, + description = "Sort columns by size in descending order", + required = false) + boolean sortBySize = false; + + @Parameter( + names = {"-p", "--percentage"}, + description = "Print ratio as percentage instead of decimal", + required = false) + boolean printAsPercentage = false; + @Override @SuppressWarnings("unchecked") public int run() throws IOException { @@ -67,6 +82,10 @@ public int run() throws IOException { // If user defined columns, only print out size for those columns if (columns != null && !columns.isEmpty()) { + // Collect aggregated column data + Map<String, Long> aggregatedSizes = new LinkedHashMap<>(); + Map<String, Float> aggregatedRatios = new LinkedHashMap<>(); + for (String inputColumn : columns) { long size = 0; float ratio = 0; @@ -76,18 +95,52 @@ public int run() throws IOException { ratio += columnRatio.get(column); } } - console.info(inputColumn + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratio); + aggregatedSizes.put(inputColumn, size); + aggregatedRatios.put(inputColumn, ratio); + } + + // Sort if requested + List<Map.Entry<String, Long>> entries = new ArrayList<>(aggregatedSizes.entrySet()); + if 
(sortBySize) { + entries.sort(Map.Entry.<String, Long>comparingByValue().reversed()); + } + + // Print results + for (Map.Entry<String, Long> entry : entries) { + String column = entry.getKey(); + long size = entry.getValue(); + float ratio = aggregatedRatios.get(column); + String ratioStr = formatRatio(ratio); + console.info(column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr); } } else { - for (String column : columnSizes.keySet()) { - console.info(column + "->" + " Size In Bytes: " + columnSizes.get(column) + " Size In Ratio: " - + columnRatio.get(column)); + // Sort if requested + List<Map.Entry<String, Long>> entries = new ArrayList<>(columnSizes.entrySet()); + if (sortBySize) { + entries.sort(Map.Entry.<String, Long>comparingByValue().reversed()); + } + + // Print results + for (Map.Entry<String, Long> entry : entries) { + String column = entry.getKey(); + long size = entry.getValue(); + float ratio = columnRatio.get(column); + String ratioStr = formatRatio(ratio); + console.info(column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr); } } return 0; } + private String formatRatio(float ratio) { + if (printAsPercentage) { + return String.format(Locale.US, "%.4f%%", ratio * 100); + } else { + return String.valueOf(ratio); + } + } + @Override public List<String> getExamples() { return Lists.newArrayList( @@ -96,7 +149,16 @@ public List<String> getExamples() { "sample.parquet -c col_1", "sample.parquet --column col_2", "sample.parquet --columns col_1 col_2", - "sample.parquet --columns col_1 col_2.sub_col_a"); + "sample.parquet --columns col_1 col_2.sub_col_a", + "# Sort columns by size in descending order", + "sample.parquet --sort", + "sample.parquet -s", + "# Print ratio as percentage", + "sample.parquet --percentage", + "sample.parquet -p", + "# Combine sorting and percentage formatting", + "sample.parquet --sort --percentage", + "sample.parquet -s -p -c col_1 col_2"); } // Make it public to allow some automation tools to call it