From dcd1c33f0dba247b43418b922c1c3a2fc432dc11 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Tue, 8 Dec 2015 10:15:30 -0800 Subject: [PATCH] PARQUET-352: Add object model property to file footers. WriteSupport now has a getName getter method; if it returns a non-null string, that string is added to the footer as writer.model.name. This is intended to help identify files that were written incorrectly by an object model. Author: Ryan Blue Closes #289 from rdblue/PARQUET-352-add-object-model-property and squashes the following commits: 23f8f67 [Ryan Blue] PARQUET-352: Add object model property to file footers. --- .../org/apache/parquet/avro/AvroWriteSupport.java | 5 +++++ .../apache/parquet/cascading/TupleWriteSupport.java | 5 +++++ .../parquet/hadoop/InternalParquetRecordWriter.java | 4 ++++ .../org/apache/parquet/hadoop/ParquetWriter.java | 2 ++ .../parquet/hadoop/api/DelegatingWriteSupport.java | 5 +++++ .../org/apache/parquet/hadoop/api/WriteSupport.java | 12 ++++++++++++ .../parquet/hadoop/example/GroupWriteSupport.java | 5 +++++ .../org/apache/parquet/hadoop/TestParquetWriter.java | 3 +++ .../org/apache/parquet/pig/TupleWriteSupport.java | 5 +++++ .../org/apache/parquet/proto/ProtoWriteSupport.java | 5 +++++ .../apache/parquet/scrooge/ScroogeWriteSupport.java | 5 +++++ .../parquet/hadoop/thrift/TBaseWriteSupport.java | 5 +++++ .../hadoop/thrift/ThriftBytesWriteSupport.java | 5 +++++ .../parquet/hadoop/thrift/ThriftWriteSupport.java | 5 +++++ .../thrift/pig/TupleToThriftWriteSupport.java | 5 +++++ 15 files changed, 76 insertions(+) diff --git a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroWriteSupport.java b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroWriteSupport.java index 48fc01ebf2..c75bb032f6 100644 --- a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroWriteSupport.java +++ b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroWriteSupport.java @@ -92,6 +92,11 @@ public AvroWriteSupport(MessageType schema, Schema avroSchema, this.model = model; } 
+ @Override + public String getName() { + return "avro"; + } + /** * @see org.apache.parquet.avro.AvroParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema) */ diff --git a/parquet-cascading/src/main/java/org/apache/parquet/cascading/TupleWriteSupport.java b/parquet-cascading/src/main/java/org/apache/parquet/cascading/TupleWriteSupport.java index 2489b2e5db..032f534870 100644 --- a/parquet-cascading/src/main/java/org/apache/parquet/cascading/TupleWriteSupport.java +++ b/parquet-cascading/src/main/java/org/apache/parquet/cascading/TupleWriteSupport.java @@ -41,6 +41,11 @@ public class TupleWriteSupport extends WriteSupport { private MessageType rootSchema; public static final String PARQUET_CASCADING_SCHEMA = "parquet.cascading.schema"; + @Override + public String getName() { + return "cascading"; + } + @Override public WriteContext init(Configuration configuration) { String schema = configuration.get(PARQUET_CASCADING_SCHEMA); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java index 87b23a2c0d..2b1d48b041 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java @@ -119,6 +119,10 @@ public void close() throws IOException, InterruptedException { flushRowGroupToStore(); FinalizedWriteContext finalWriteContext = writeSupport.finalizeWrite(); Map finalMetadata = new HashMap(extraMetaData); + String modelName = writeSupport.getName(); + if (modelName != null) { + finalMetadata.put(ParquetWriter.OBJECT_MODEL_NAME_PROP, modelName); + } finalMetadata.putAll(finalWriteContext.getExtraMetaData()); parquetFileWriter.end(finalMetadata); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java 
b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java index e2521fb090..be8c0cd78b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java @@ -45,6 +45,8 @@ public class ParquetWriter implements Closeable { public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0; + public static final String OBJECT_MODEL_NAME_PROP = "writer.model.name"; + // max size (bytes) to write as padding and the min size of a row group public static final int MAX_PADDING_SIZE_DEFAULT = 0; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/DelegatingWriteSupport.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/DelegatingWriteSupport.java index 207bb1a880..66a4b01a91 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/DelegatingWriteSupport.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/DelegatingWriteSupport.java @@ -54,6 +54,11 @@ public void write(T record) { delegate.write(record); } + @Override + public String getName() { + return delegate.getName(); + } + @Override public WriteSupport.FinalizedWriteContext finalizeWrite() { return delegate.finalizeWrite(); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/WriteSupport.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/WriteSupport.java index 91c37c3640..1a61faa775 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/WriteSupport.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/api/WriteSupport.java @@ -120,6 +120,18 @@ public Map getExtraMetaData() { */ public abstract void write(T record); + /** + * Called to get a name to identify the WriteSupport object model. + * If not null, this is added to the file footer metadata. + * <p>

+ * Defining this method will be required in a future API version. + * + * @return a String name for file metadata. + */ + public String getName() { + return null; + } + /** * called once in the end after the last record was written * @return information to be added in the file diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/example/GroupWriteSupport.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/example/GroupWriteSupport.java index ee59a6eda6..c038f255bc 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/example/GroupWriteSupport.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/example/GroupWriteSupport.java @@ -63,6 +63,11 @@ public GroupWriteSupport() { this.extraMetaData = extraMetaData; } + @Override + public String getName() { + return "example"; + } + @Override public org.apache.parquet.hadoop.api.WriteSupport.WriteContext init(Configuration configuration) { // if present, prefer the schema passed to the constructor diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index e327643572..6fc3c72f84 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -135,6 +135,9 @@ public void test() throws Exception { } } } + assertEquals("Object model property should be example", + "example", footer.getFileMetaData().getKeyValueMetaData() + .get(ParquetWriter.OBJECT_MODEL_NAME_PROP)); } } } diff --git a/parquet-pig/src/main/java/org/apache/parquet/pig/TupleWriteSupport.java b/parquet-pig/src/main/java/org/apache/parquet/pig/TupleWriteSupport.java index 829fe7072a..2cf676c357 100644 --- a/parquet-pig/src/main/java/org/apache/parquet/pig/TupleWriteSupport.java +++ b/parquet-pig/src/main/java/org/apache/parquet/pig/TupleWriteSupport.java @@ -67,6 +67,11 @@ public 
TupleWriteSupport(Schema pigSchema) { this.rootPigSchema = pigSchema; } + @Override + public String getName() { + return "pig"; + } + public Schema getPigSchema() { return rootPigSchema; } diff --git a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java index 40e36d5764..d7f7a53512 100644 --- a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java +++ b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java @@ -62,6 +62,11 @@ public ProtoWriteSupport(Class protobufClass) { this.protoMessage = protobufClass; } + @Override + public String getName() { + return "protobuf"; + } + public static void setSchema(Configuration configuration, Class protoClass) { configuration.setClass(PB_CLASS_WRITE, protoClass, Message.class); } diff --git a/parquet-scrooge/src/main/java/org/apache/parquet/scrooge/ScroogeWriteSupport.java b/parquet-scrooge/src/main/java/org/apache/parquet/scrooge/ScroogeWriteSupport.java index a478bf7758..7b72b735af 100644 --- a/parquet-scrooge/src/main/java/org/apache/parquet/scrooge/ScroogeWriteSupport.java +++ b/parquet-scrooge/src/main/java/org/apache/parquet/scrooge/ScroogeWriteSupport.java @@ -48,6 +48,11 @@ public ScroogeWriteSupport(Class thriftClass) { super(thriftClass); } + @Override + public String getName() { + return "scrooge"; + } + @Override protected StructType getThriftStruct() { ScroogeStructConverter schemaConverter = new ScroogeStructConverter(); diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java index b45727829d..56bf2991cd 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/TBaseWriteSupport.java @@ -45,6 +45,11 @@ public 
TBaseWriteSupport(Class thriftClass) { super(thriftClass); } + @Override + public String getName() { + return "thrift"; + } + @Override protected StructType getThriftStruct() { return ThriftSchemaConverter.toStructType(thriftClass); diff --git a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java index 6db769ecb7..f6f511b813 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/hadoop/thrift/ThriftBytesWriteSupport.java @@ -92,6 +92,11 @@ public ThriftBytesWriteSupport(TProtocolFactory protocolFactory, Class thriftClass) { this.writeSupport = new TBaseWriteSupport(thriftClass); } + @Override + public String getName() { + return writeSupport.getName(); + } + @Override public WriteContext init(Configuration configuration) { return this.writeSupport.init(configuration); diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/pig/TupleToThriftWriteSupport.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/pig/TupleToThriftWriteSupport.java index 53fc16dda5..b8add82297 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/pig/TupleToThriftWriteSupport.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/pig/TupleToThriftWriteSupport.java @@ -49,6 +49,11 @@ public TupleToThriftWriteSupport(String className) { this.className = className; } + @Override + public String getName() { + return "thrift"; + } + @SuppressWarnings({"rawtypes", "unchecked"}) @Override public WriteContext init(Configuration configuration) {