From a0983cca60746f85f4ac6fd94a1c4c30bf2614a2 Mon Sep 17 00:00:00 2001 From: arnavb Date: Wed, 27 Aug 2025 07:21:53 +0000 Subject: [PATCH 1/6] update --- parquet-cli/README.md | 22 +++++++ .../cli/commands/ConvertCSVCommand.java | 25 +++++++- .../parquet/cli/commands/ConvertCommand.java | 25 +++++++- .../cli/commands/ConvertCSVCommandTest.java | 16 +++++ .../cli/commands/ConvertCommandTest.java | 59 +++++++++++++++++++ 5 files changed, 145 insertions(+), 2 deletions(-) diff --git a/parquet-cli/README.md b/parquet-cli/README.md index 9b75efaa1a..090fa2abb3 100644 --- a/parquet-cli/README.md +++ b/parquet-cli/README.md @@ -134,3 +134,25 @@ Usage: parquet [options] [command] [command options] See 'parquet help ' for more information on a specific command. ``` +### Configuration Options + +The `convert` and `convert-csv` commands support a generic configuration mechanism: + +- `--conf` or `--property`: Set any configuration property in format `key=value`. Can be specified multiple times. + +This allows you to configure any Avro or Parquet setting without needing to modify source code or rebuild the project. + +Examples: +```bash +# Enable UUID support +parquet convert input.avro -o output.parquet --conf parquet.avro.write-parquet-uuid=true + +# Use new 3-level list structure +parquet convert input.avro -o output.parquet --conf parquet.avro.write-old-list-structure=false + +# Convert CSV with multiple options +parquet convert-csv input.csv -o output.parquet --schema schema.avsc --conf parquet.avro.write-parquet-uuid=true --conf parquet.avro.write-old-list-structure=false + +# Set any other configuration properties +parquet convert input.avro -o output.parquet --conf parquet.avro.add-list-element-records=false --conf parquet.avro.write.data.supplier=org.apache.parquet.avro.GenericDataSupplier +``` diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java index d7249eba65..6d1e64cf91 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java @@ -45,6 +45,7 @@ import org.apache.parquet.cli.util.Schemas; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.hadoop.conf.Configuration; import org.slf4j.Logger; @Parameters(commandDescription = "Create a file from CSV data") @@ -117,6 +118,11 @@ public ConvertCSVCommand(Logger console) { description = "Remove any data already in the target view or dataset") boolean overwrite = false; + @Parameter( + names = {"--conf", "--property"}, + description = "Set a configuration property (format: key=value). Can be specified multiple times.") + List confProperties; + @Override @SuppressWarnings("unchecked") public int run() throws IOException { @@ -168,6 +174,23 @@ public int run() throws IOException { } } + // Create a configuration and apply custom properties + Configuration conf = new Configuration(getConf()); + + // Apply custom configuration properties + if (confProperties != null) { + for (String prop : confProperties) { + String[] parts = prop.split("=", 2); + if (parts.length != 2) { + throw new IllegalArgumentException("Configuration property must be in format key=value: " + prop); + } + String key = parts[0].trim(); + String value = parts[1].trim(); + conf.set(key, value); + console.debug("Set configuration property: {}={}", key, value); + } + } + try (ParquetWriter writer = AvroParquetWriter.builder(qualifiedPath(outputPath)) .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0) .withWriteMode(overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE) @@ -177,7 +200,7 @@ public int run() throws IOException { .withPageSize(pageSize) .withRowGroupSize(rowGroupSize) .withDataModel(GenericData.get()) - .withConf(getConf()) + .withConf(conf) .withSchema(csvSchema) .build()) { for (String target : targets) { diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java index c92527c5d0..0b4dcb96a0 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java @@ -43,6 +43,7 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.slf4j.Logger; +import org.apache.hadoop.conf.Configuration; @Parameters(commandDescription = "Create a Parquet file from a data file") public class ConvertCommand extends BaseCommand { @@ -95,6 +96,11 @@ public ConvertCommand(Logger console) { @Parameter(names = "--dictionary-size", description = "Max dictionary page size") int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + @Parameter( + names = {"--conf", "--property"}, + description = "Set a configuration property (format: key=value). Can be specified multiple times.") + List confProperties; + @Override @SuppressWarnings("unchecked") public int run() throws IOException { @@ -119,13 +125,30 @@ public int run() throws IOException { outFS.delete(outPath); } + // Create a configuration and apply custom properties + Configuration conf = new Configuration(getConf()); + + // Apply custom configuration properties + if (confProperties != null) { + for (String prop : confProperties) { + String[] parts = prop.split("=", 2); + if (parts.length != 2) { + throw new IllegalArgumentException("Configuration property must be in format key=value: " + prop); + } + String key = parts[0].trim(); + String value = parts[1].trim(); + conf.set(key, value); + console.debug("Set configuration property: {}={}", key, value); + } + } + Iterable reader = openDataFile(source, projection); boolean threw = true; long count = 0; try { try (ParquetWriter writer = AvroParquetWriter.builder(qualifiedPath(outputPath)) .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0) - .withConf(getConf()) + .withConf(conf) .withCompressionCodec(codec) .withRowGroupSize(rowGroupSize) .withDictionaryPageSize(dictionaryPageSize < 64 ? 64 : dictionaryPageSize) diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java index 05053c097d..b0e8ef29b6 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java @@ -61,4 +61,20 @@ public void testConvertCSVCommandWithDifferentSchemas() throws IOException { command.setConf(new Configuration()); command.run(); } + + @Test + public void testConvertCSVCommandWithGenericConf() throws IOException { + File file = csvFile(); + ConvertCSVCommand command = new ConvertCSVCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + File output = new File(getTempFolder(), getClass().getSimpleName() + "_with_generic_conf.parquet"); + command.outputPath = output.getAbsolutePath(); + command.confProperties = Arrays.asList( + "parquet.avro.write-parquet-uuid=true", + "parquet.avro.write-old-list-structure=false" + ); + command.setConf(new Configuration()); + Assert.assertEquals(0, command.run()); + Assert.assertTrue(output.exists()); + } } diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java index 4870c48b49..dc208e2e89 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java @@ -37,4 +37,63 @@ public void testConvertCommand() throws IOException { Assert.assertEquals(0, command.run()); Assert.assertTrue(output.exists()); } + + @Test + public void testConvertCommandWithGenericConf() throws IOException { + File file = toAvro(parquetFile()); + ConvertCommand command = new ConvertCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + File output = new File(getTempFolder(), "converted_with_generic_conf.parquet"); + command.outputPath = output.getAbsolutePath(); + command.confProperties = Arrays.asList( + "parquet.avro.write-parquet-uuid=true", + "parquet.avro.write-old-list-structure=false", + "test.property=test.value" + ); + command.setConf(new Configuration()); + + Assert.assertEquals(0, command.run()); + Assert.assertTrue(output.exists()); + } + + @Test + public void testConvertCommandConfigurationValidation() throws IOException { + File file = toAvro(parquetFile()); + ConvertCommand command = new ConvertCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + File output = new File(getTempFolder(), "converted_with_config_validation.parquet"); + command.outputPath = output.getAbsolutePath(); + + command.confProperties = Arrays.asList( + "parquet.avro.write-parquet-uuid=true", + "parquet.avro.write-old-list-structure=false" + ); + + command.setConf(new Configuration()); + + Assert.assertEquals(0, command.run()); + Assert.assertTrue(output.exists()); + + File output2 = new File(getTempFolder(), "converted_with_config_validation2.parquet"); + command.outputPath = output2.getAbsolutePath(); + command.confProperties = Arrays.asList( + "parquet.avro.write-parquet-uuid=false", + "parquet.avro.write-old-list-structure=true" + ); + + Assert.assertEquals(0, command.run()); + Assert.assertTrue(output2.exists()); + } + + @Test(expected = IllegalArgumentException.class) + public void testConvertCommandWithInvalidConf() throws IOException { + File file = toAvro(parquetFile()); + ConvertCommand command = new ConvertCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + File output = new File(getTempFolder(), "converted_with_invalid_conf.parquet"); + command.outputPath = output.getAbsolutePath(); + command.confProperties = Arrays.asList("invalid-property-format"); + command.setConf(new Configuration()); + command.run(); + } } From e7827362b5886f4560730736fae491c73096aa90 Mon Sep 17 00:00:00 2001 From: arnavb Date: Wed, 27 Aug 2025 07:28:32 +0000 Subject: [PATCH 2/6] update --- parquet-cli/README.md | 10 +--------- .../parquet/cli/commands/ConvertCSVCommand.java | 2 -- .../parquet/cli/commands/ConvertCommand.java | 4 +--- .../cli/commands/ConvertCSVCommandTest.java | 6 ++---- .../parquet/cli/commands/ConvertCommandTest.java | 15 +++++---------- 5 files changed, 9 insertions(+), 28 deletions(-) diff --git a/parquet-cli/README.md b/parquet-cli/README.md index 090fa2abb3..c7b3540a4b 100644 --- a/parquet-cli/README.md +++ b/parquet-cli/README.md @@ -136,23 +136,15 @@ Usage: parquet [options] [command] [command options] ### Configuration Options -The `convert` and `convert-csv` commands support a generic configuration mechanism: - - `--conf` or `--property`: Set any configuration property in format `key=value`. Can be specified multiple times. -This allows you to configure any Avro or Parquet setting without needing to modify source code or rebuild the project. - Examples: ```bash -# Enable UUID support parquet convert input.avro -o output.parquet --conf parquet.avro.write-parquet-uuid=true -# Use new 3-level list structure parquet convert input.avro -o output.parquet --conf parquet.avro.write-old-list-structure=false -# Convert CSV with multiple options +# Multiple options parquet convert-csv input.csv -o output.parquet --schema schema.avsc --conf parquet.avro.write-parquet-uuid=true --conf parquet.avro.write-old-list-structure=false -# Set any other configuration properties -parquet convert input.avro -o output.parquet --conf parquet.avro.add-list-element-records=false --conf parquet.avro.write.data.supplier=org.apache.parquet.avro.GenericDataSupplier ``` diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java index 6d1e64cf91..cc779a745f 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java @@ -174,10 +174,8 @@ public int run() throws IOException { } } - // Create a configuration and apply custom properties Configuration conf = new Configuration(getConf()); - // Apply custom configuration properties if (confProperties != null) { for (String prop : confProperties) { String[] parts = prop.split("=", 2); diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java index 0b4dcb96a0..1cf48af5f7 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java @@ -34,6 +34,7 @@ import java.util.List; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroParquetWriter; @@ -43,7 +44,6 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.slf4j.Logger; -import org.apache.hadoop.conf.Configuration; @Parameters(commandDescription = "Create a Parquet file from a data file") public class ConvertCommand extends BaseCommand { @@ -125,10 +125,8 @@ public int run() throws IOException { outFS.delete(outPath); } - // Create a configuration and apply custom properties Configuration conf = new Configuration(getConf()); - // Apply custom configuration properties if (confProperties != null) { for (String prop : confProperties) { String[] parts = prop.split("=", 2); diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java index b0e8ef29b6..e6901464e9 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java @@ -69,10 +69,8 @@ public void testConvertCSVCommandWithGenericConf() throws IOException { command.targets = Arrays.asList(file.getAbsolutePath()); File output = new File(getTempFolder(), getClass().getSimpleName() + "_with_generic_conf.parquet"); command.outputPath = output.getAbsolutePath(); - command.confProperties = Arrays.asList( - "parquet.avro.write-parquet-uuid=true", - "parquet.avro.write-old-list-structure=false" - ); + command.confProperties = + Arrays.asList("parquet.avro.write-parquet-uuid=true", "parquet.avro.write-old-list-structure=false"); command.setConf(new Configuration()); Assert.assertEquals(0, command.run()); Assert.assertTrue(output.exists()); diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java index dc208e2e89..f12b3c684e 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java @@ -48,8 +48,7 @@ public void testConvertCommandWithGenericConf() throws IOException { command.confProperties = Arrays.asList( "parquet.avro.write-parquet-uuid=true", "parquet.avro.write-old-list-structure=false", - "test.property=test.value" - ); + "test.property=test.value"); command.setConf(new Configuration()); Assert.assertEquals(0, command.run()); @@ -64,10 +63,8 @@ public void testConvertCommandConfigurationValidation() throws IOException { File output = new File(getTempFolder(), "converted_with_config_validation.parquet"); command.outputPath = output.getAbsolutePath(); - command.confProperties = Arrays.asList( - "parquet.avro.write-parquet-uuid=true", - "parquet.avro.write-old-list-structure=false" - ); + command.confProperties = + Arrays.asList("parquet.avro.write-parquet-uuid=true", "parquet.avro.write-old-list-structure=false"); command.setConf(new Configuration()); @@ -76,10 +73,8 @@ public void testConvertCommandConfigurationValidation() throws IOException { File output2 = new File(getTempFolder(), "converted_with_config_validation2.parquet"); command.outputPath = output2.getAbsolutePath(); - command.confProperties = Arrays.asList( - "parquet.avro.write-parquet-uuid=false", - "parquet.avro.write-old-list-structure=true" - ); + command.confProperties = + Arrays.asList("parquet.avro.write-parquet-uuid=false", "parquet.avro.write-old-list-structure=true"); Assert.assertEquals(0, command.run()); Assert.assertTrue(output2.exists()); From d2e5152a9d09d557c3e6f3dfce6f46e4e3cc006f Mon Sep 17 00:00:00 2001 From: arnavb Date: Wed, 27 Aug 2025 09:54:07 +0000 Subject: [PATCH 3/6] update --- .../java/org/apache/parquet/cli/commands/ConvertCSVCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java index cc779a745f..5fba960f36 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java @@ -36,6 +36,7 @@ import org.apache.avro.Schema; import org.apache.avro.SchemaNormalization; import org.apache.avro.generic.GenericData; +import org.apache.hadoop.conf.Configuration; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.cli.BaseCommand; import org.apache.parquet.cli.csv.AvroCSV; @@ -45,7 +46,6 @@ import org.apache.parquet.cli.util.Schemas; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.hadoop.conf.Configuration; import org.slf4j.Logger; @Parameters(commandDescription = "Create a file from CSV data") From cd6d5598e9eb12bfebd5897c4ca0cfda946ac3b0 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 28 Aug 2025 11:24:16 +0000 Subject: [PATCH 4/6] update --- .../java/org/apache/parquet/cli/Main.java | 28 +++++++++++++-- .../cli/commands/ConvertCSVCommand.java | 22 +----------- .../parquet/cli/commands/ConvertCommand.java | 22 +----------- .../cli/commands/ConvertCSVCommandTest.java | 7 ++-- .../cli/commands/ConvertCommandTest.java | 36 +++++++------------ 5 files changed, 44 insertions(+), 71 deletions(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index 62940054e6..57b32d670d 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -26,6 +26,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import java.util.Set; +import java.util.List; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; @@ -67,6 +68,11 @@ public class Main extends Configured implements Tool { description = "Print extra debugging information") private boolean debug = false; + @Parameter( + names = {"--conf", "--property"}, + description = "Set a configuration property (format: key=value). Can be specified multiple times.") + private List confProperties; + @VisibleForTesting @Parameter(names = "--dollar-zero", description = "A way for the runtime path to be passed in", hidden = true) String programName = DEFAULT_PROGRAM_NAME; @@ -162,10 +168,25 @@ public int run(String[] args) throws Exception { return 1; } - try { - if (command instanceof Configurable) { - ((Configurable) command).setConf(getConf()); + // Note to developer: This is a generic way to apply configs to given command. + // If the command does not support the configs, it would simply be ignored. + if (command instanceof Configurable) { + Configuration merged = new Configuration(getConf()); + if (confProperties != null) { + for (String prop : confProperties) { + String[] parts = prop.split("=", 2); + if (parts.length != 2) { + throw new IllegalArgumentException( + "Configuration property must be in format key=value: " + prop); + } + merged.set(parts[0].trim(), parts[1].trim()); + console.debug("Set configuration property: {}={}", parts[0].trim(), parts[1].trim()); + } } + ((Configurable) command).setConf(merged); + } + + try { return command.run(); } catch (IllegalArgumentException e) { if (debug) { @@ -185,6 +206,7 @@ public int run(String[] args) throws Exception { console.error("Unknown error", e); return 1; } + } public static void main(String[] args) throws Exception { diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java index 5fba960f36..4de811a0e0 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java @@ -118,11 +118,6 @@ public ConvertCSVCommand(Logger console) { description = "Remove any data already in the target view or dataset") boolean overwrite = false; - @Parameter( - names = {"--conf", "--property"}, - description = "Set a configuration property (format: key=value). Can be specified multiple times.") - List confProperties; - @Override @SuppressWarnings("unchecked") public int run() throws IOException { @@ -174,21 +169,6 @@ public int run() throws IOException { } } - Configuration conf = new Configuration(getConf()); - - if (confProperties != null) { - for (String prop : confProperties) { - String[] parts = prop.split("=", 2); - if (parts.length != 2) { - throw new IllegalArgumentException("Configuration property must be in format key=value: " + prop); - } - String key = parts[0].trim(); - String value = parts[1].trim(); - conf.set(key, value); - console.debug("Set configuration property: {}={}", key, value); - } - } - try (ParquetWriter writer = AvroParquetWriter.builder(qualifiedPath(outputPath)) .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0) .withWriteMode(overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE) @@ -198,7 +178,7 @@ public int run() throws IOException { .withPageSize(pageSize) .withRowGroupSize(rowGroupSize) .withDataModel(GenericData.get()) - .withConf(conf) + .withConf(getConf()) .withSchema(csvSchema) .build()) { for (String target : targets) { diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java index 1cf48af5f7..54c21edfb9 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java @@ -96,11 +96,6 @@ public ConvertCommand(Logger console) { @Parameter(names = "--dictionary-size", description = "Max dictionary page size") int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE; - @Parameter( - names = {"--conf", "--property"}, - description = "Set a configuration property (format: key=value). Can be specified multiple times.") - List confProperties; - @Override @SuppressWarnings("unchecked") public int run() throws IOException { @@ -125,28 +120,13 @@ public int run() throws IOException { outFS.delete(outPath); } - Configuration conf = new Configuration(getConf()); - - if (confProperties != null) { - for (String prop : confProperties) { - String[] parts = prop.split("=", 2); - if (parts.length != 2) { - throw new IllegalArgumentException("Configuration property must be in format key=value: " + prop); - } - String key = parts[0].trim(); - String value = parts[1].trim(); - conf.set(key, value); - console.debug("Set configuration property: {}={}", key, value); - } - } - Iterable reader = openDataFile(source, projection); boolean threw = true; long count = 0; try { try (ParquetWriter writer = AvroParquetWriter.builder(qualifiedPath(outputPath)) .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0) - .withConf(conf) + .withConf(getConf()) .withCompressionCodec(codec) .withRowGroupSize(rowGroupSize) .withDictionaryPageSize(dictionaryPageSize < 64 ? 64 : dictionaryPageSize) diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java index e6901464e9..29ed16224b 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCSVCommandTest.java @@ -69,9 +69,10 @@ public void testConvertCSVCommandWithGenericConf() throws IOException { command.targets = Arrays.asList(file.getAbsolutePath()); File output = new File(getTempFolder(), getClass().getSimpleName() + "_with_generic_conf.parquet"); command.outputPath = output.getAbsolutePath(); - command.confProperties = - Arrays.asList("parquet.avro.write-parquet-uuid=true", "parquet.avro.write-old-list-structure=false"); - command.setConf(new Configuration()); + Configuration conf = new Configuration(); + conf.set("parquet.avro.write-parquet-uuid", "true"); + conf.set("parquet.avro.write-old-list-structure", "false"); + command.setConf(conf); Assert.assertEquals(0, command.run()); Assert.assertTrue(output.exists()); } diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java index f12b3c684e..c38a5b25bf 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ConvertCommandTest.java @@ -45,11 +45,11 @@ public void testConvertCommandWithGenericConf() throws IOException { command.targets = Arrays.asList(file.getAbsolutePath()); File output = new File(getTempFolder(), "converted_with_generic_conf.parquet"); command.outputPath = output.getAbsolutePath(); - command.confProperties = Arrays.asList( - "parquet.avro.write-parquet-uuid=true", - "parquet.avro.write-old-list-structure=false", - "test.property=test.value"); - command.setConf(new Configuration()); + Configuration conf = new Configuration(); + conf.set("parquet.avro.write-parquet-uuid", "true"); + conf.set("parquet.avro.write-old-list-structure", "false"); + conf.set("test.property", "test.value"); + command.setConf(conf); Assert.assertEquals(0, command.run()); Assert.assertTrue(output.exists()); @@ -63,32 +63,22 @@ public void testConvertCommandConfigurationValidation() throws IOException { File output = new File(getTempFolder(), "converted_with_config_validation.parquet"); command.outputPath = output.getAbsolutePath(); - command.confProperties = - Arrays.asList("parquet.avro.write-parquet-uuid=true", "parquet.avro.write-old-list-structure=false"); - - command.setConf(new Configuration()); + Configuration conf = new Configuration(); + conf.set("parquet.avro.write-parquet-uuid", "true"); + conf.set("parquet.avro.write-old-list-structure", "false"); + command.setConf(conf); Assert.assertEquals(0, command.run()); Assert.assertTrue(output.exists()); File output2 = new File(getTempFolder(), "converted_with_config_validation2.parquet"); command.outputPath = output2.getAbsolutePath(); - command.confProperties = - Arrays.asList("parquet.avro.write-parquet-uuid=false", "parquet.avro.write-old-list-structure=true"); + Configuration conf2 = new Configuration(); + conf2.set("parquet.avro.write-parquet-uuid", "false"); + conf2.set("parquet.avro.write-old-list-structure", "true"); + command.setConf(conf2); Assert.assertEquals(0, command.run()); Assert.assertTrue(output2.exists()); } - - @Test(expected = IllegalArgumentException.class) - public void testConvertCommandWithInvalidConf() throws IOException { - File file = toAvro(parquetFile()); - ConvertCommand command = new ConvertCommand(createLogger()); - command.targets = Arrays.asList(file.getAbsolutePath()); - File output = new File(getTempFolder(), "converted_with_invalid_conf.parquet"); - command.outputPath = output.getAbsolutePath(); - command.confProperties = Arrays.asList("invalid-property-format"); - command.setConf(new Configuration()); - command.run(); - } } From 76e9b252ba46d440f26389b9d2aa442e59742e4c Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 28 Aug 2025 11:31:50 +0000 Subject: [PATCH 5/6] address comments --- .../java/org/apache/parquet/cli/commands/ConvertCommand.java | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java index 54c21edfb9..c92527c5d0 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java @@ -34,7 +34,6 @@ import java.util.List; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroParquetWriter; From 9ae130e23aeed34ab82398389ca1d8ab5ef27953 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 28 Aug 2025 12:04:55 +0000 Subject: [PATCH 6/6] lint --- parquet-cli/src/main/java/org/apache/parquet/cli/Main.java | 3 +-- .../org/apache/parquet/cli/commands/ConvertCSVCommand.java | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index 57b32d670d..e93a21e899 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -25,8 +25,8 @@ import com.beust.jcommander.Parameters; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; -import java.util.Set; import java.util.List; +import java.util.Set; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; @@ -206,7 +206,6 @@ public int run(String[] args) throws Exception { console.error("Unknown error", e); return 1; } - } public static void main(String[] args) throws Exception { diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java index 4de811a0e0..d7249eba65 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java @@ -36,7 +36,6 @@ import org.apache.avro.Schema; import org.apache.avro.SchemaNormalization; import org.apache.avro.generic.GenericData; -import org.apache.hadoop.conf.Configuration; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.cli.BaseCommand; import org.apache.parquet.cli.csv.AvroCSV;