forked from apache/flink
[FLINK-7243][connector] Add ParquetInputFormats for Row, Map, and POJO.
This closes apache#6483.
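For illustration, a minimal sketch of how the new Row input format might be used from a batch job. The class name ParquetRowInputFormat, its constructor, and the example schema are assumptions based on the commit title rather than code shown in this diff; the base-class calls (selectFields, configure, PARQUET_SKIP_WRONG_SCHEMA_SPLITS) are taken from ParquetInputFormat below.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.ParquetInputFormat;
import org.apache.flink.formats.parquet.ParquetRowInputFormat;
import org.apache.flink.types.Row;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class ParquetRowReadExample {

	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// Expected Parquet schema of the input files.
		MessageType schema = Types.buildMessage()
			.required(PrimitiveTypeName.INT64).named("id")
			.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
			.named("flink_parquet_example");

		// Hypothetical constructor, mirroring ParquetInputFormat(Path, MessageType).
		ParquetRowInputFormat inputFormat =
			new ParquetRowInputFormat(new Path("file:///tmp/data.parquet"), schema);

		// Project to a subset of the fields and tolerate splits whose schema does not match.
		inputFormat.selectFields(new String[] {"id", "name"});
		Configuration parameters = new Configuration();
		parameters.setBoolean(ParquetInputFormat.PARQUET_SKIP_WRONG_SCHEMA_SPLITS, true);
		inputFormat.configure(parameters);

		DataSet<Row> rows = env.createInput(inputFormat);
		rows.print();
	}
}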
Commit d015ce7 (1 parent: 168660a)
Showing 22 changed files with 3,274 additions and 517 deletions.
...mats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java
290 additions, 0 deletions
@@ -0,0 +1,290 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.formats.parquet;

import org.apache.flink.api.common.io.CheckpointableInputFormat;
import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.utils.ParquetRecordReader;
import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter;
import org.apache.flink.formats.parquet.utils.RowReadSupport;
import org.apache.flink.metrics.Counter;
import org.apache.flink.types.Row;
import org.apache.flink.util.Preconditions;

import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * The base InputFormat class to read from Parquet files.
 * For specific return types the {@link #convert(Row)} method needs to be implemented.
 *
 * <p>Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream},
 * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behavior.
 *
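 * <p>A minimal sketch of a concrete subclass (the class name is hypothetical; the concrete
 * formats added by this change live in their own files):
 * <pre>{@code
 * public class MyParquetRowInputFormat extends ParquetInputFormat<Row> {
 *
 *     public MyParquetRowInputFormat(Path path, MessageType messageType) {
 *         super(path, messageType);
 *     }
 *
 *     protected Row convert(Row row) {
 *         // Records are already materialized as Row, so no further conversion is needed.
 *         return row;
 *     }
 * }
 * }</pre>
 *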
 * @param <E> The type of record to read.
 */
public abstract class ParquetInputFormat<E>
	extends FileInputFormat<E>
	implements CheckpointableInputFormat<FileInputSplit, Tuple2<Long, Long>> {

	private static final long serialVersionUID = 1L;

	private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class);

	/**
	 * The flag to specify whether to skip file splits with a wrong schema.
	 */
	private boolean skipWrongSchemaFileSplit = false;

	/**
	 * The flag to specify whether to skip corrupted records.
	 */
	private boolean skipCorruptedRecord = false;

	/**
	 * The flag to track that the current split should be skipped.
	 */
	private boolean skipThisSplit = false;

	private TypeInformation[] fieldTypes;

	private String[] fieldNames;

	private transient Counter recordConsumed;

	private transient MessageType expectedFileSchema;

	private transient ParquetRecordReader<Row> parquetRecordReader;

	/**
	 * Reads Parquet files with the given Parquet file schema.
	 *
	 * @param path The path of the file to read.
	 * @param messageType schema of the parquet file
	 */
	protected ParquetInputFormat(Path path, MessageType messageType) {
		super(path);
		this.expectedFileSchema = checkNotNull(messageType, "messageType");
		RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema);
		this.fieldTypes = rowTypeInfo.getFieldTypes();
		this.fieldNames = rowTypeInfo.getFieldNames();
		// read the whole parquet file as one file split
		this.unsplittable = true;
	}

	@Override
	public void configure(Configuration parameters) {
		super.configure(parameters);

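		// A flag that is already enabled programmatically is kept as-is; otherwise fall back to the
		// value supplied via the configuration parameters.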
		if (!this.skipWrongSchemaFileSplit) {
			this.skipWrongSchemaFileSplit = parameters.getBoolean(PARQUET_SKIP_WRONG_SCHEMA_SPLITS, false);
		}

		if (!this.skipCorruptedRecord) {
			this.skipCorruptedRecord = parameters.getBoolean(PARQUET_SKIP_CORRUPTED_RECORD, false);
		}
	}

	/**
	 * Configures the fields to be read and returned by the ParquetInputFormat. Selected fields must be present
	 * in the configured schema.
	 *
	 * @param fieldNames Names of all selected fields.
	 */
	public void selectFields(String[] fieldNames) {
		checkNotNull(fieldNames, "fieldNames");
		this.fieldNames = fieldNames;
		RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema);
		TypeInformation[] selectFieldTypes = new TypeInformation[fieldNames.length];
		for (int i = 0; i < fieldNames.length; i++) {
			try {
				selectFieldTypes[i] = rowTypeInfo.getTypeAt(fieldNames[i]);
			} catch (IndexOutOfBoundsException e) {
				throw new IllegalArgumentException(String.format("Failed to access field %s, "
					+ "which is not contained in the file schema", fieldNames[i]), e);
			}
		}
		this.fieldTypes = selectFieldTypes;
	}

	@Override
	public Tuple2<Long, Long> getCurrentState() {
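		// The reader's current position, presumably the current block and the number of records already read
		// within it (see ParquetRecordReader#getCurrentReadPosition); reopen() seeks back to this state.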
		return parquetRecordReader.getCurrentReadPosition();
	}

	@Override
	public void open(FileInputSplit split) throws IOException {
		// reset the flag when opening a new split
		this.skipThisSplit = false;
		org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
		InputFile inputFile =
			HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
		ParquetReadOptions options = ParquetReadOptions.builder().build();
		ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
		MessageType fileSchema = fileReader.getFileMetaData().getSchema();
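		// getReadSchema() projects the file schema onto the selected fields; when the schemas do not match and
		// skipping is enabled, it flags this split to be skipped instead of failing.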
		MessageType readSchema = getReadSchema(fileSchema, split.getPath());
		if (skipThisSplit) {
			LOG.warn(String.format(
				"Skipped file split [%s] because its schema does not match the expected read schema",
				split.getPath().toString()));
		} else {
			this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema, FilterCompat.NOOP);
			this.parquetRecordReader.initialize(fileReader, configuration);
			this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

			if (this.recordConsumed == null) {
				this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
			}

			LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
		}
	}

	@Override
	public void reopen(FileInputSplit split, Tuple2<Long, Long> state) throws IOException {
		Preconditions.checkNotNull(split, "reopen() cannot be called on a null split.");
		Preconditions.checkNotNull(state, "reopen() cannot be called with a null initial state.");
		this.open(split);
		// seek to the read position in the split that we were at when the checkpoint was taken.
		parquetRecordReader.seek(state.f0, state.f1);
	}

	/**
	 * Get field names of the read result.
	 *
	 * @return field names array
	 */
	protected String[] getFieldNames() {
		return fieldNames;
	}

	/**
	 * Get field types of the read result.
	 *
	 * @return field types array
	 */
	protected TypeInformation[] getFieldTypes() {
		return fieldTypes;
	}

	@Override
	public void close() throws IOException {
		if (parquetRecordReader != null) {
			parquetRecordReader.close();
		}
	}

	@Override
	public boolean reachedEnd() throws IOException {
		if (skipThisSplit) {
			return true;
		}

		return parquetRecordReader.reachEnd();
	}

	@Override
	public E nextRecord(E e) throws IOException {
		if (reachedEnd()) {
			return null;
		}

		recordConsumed.inc();
		return convert(parquetRecordReader.nextRecord());
	}

	/**
	 * This ParquetInputFormat reads Parquet records as {@link Row} by default. Subclasses can implement this method
	 * to further convert the row to other types, such as POJO, Map or Tuple.
	 *
	 * @param row row read from parquet file
	 * @return E target result type
	 */
	protected abstract E convert(Row row);

	/**
	 * Generates and returns the read schema based on the projected fields for a given file.
	 *
	 * @param fileSchema The schema of the given file.
	 * @param filePath The path of the given file.
	 * @return The read schema based on the given file's schema and the projected fields.
	 */
	private MessageType getReadSchema(MessageType fileSchema, Path filePath) {
		RowTypeInfo fileTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(fileSchema);
		List<Type> types = new ArrayList<>();
		for (int i = 0; i < fieldNames.length; ++i) {
			String readFieldName = fieldNames[i];
			TypeInformation<?> readFieldType = fieldTypes[i];
			if (fileTypeInfo.getFieldIndex(readFieldName) < 0) {
				if (!skipWrongSchemaFileSplit) {
					throw new IllegalArgumentException("Field " + readFieldName + " cannot be found in the schema of"
						+ " Parquet file: " + filePath + ".");
				} else {
					this.skipThisSplit = true;
					return fileSchema;
				}
			}

			if (!readFieldType.equals(fileTypeInfo.getTypeAt(readFieldName))) {
				if (!skipWrongSchemaFileSplit) {
					throw new IllegalArgumentException("Expecting type " + readFieldType + " for field " + readFieldName
						+ " but found type " + fileTypeInfo.getTypeAt(readFieldName) + " in Parquet file: "
						+ filePath + ".");
				} else {
					this.skipThisSplit = true;
					return fileSchema;
				}
			}
			types.add(fileSchema.getType(readFieldName));
		}

		return new MessageType(fileSchema.getName(), types);
	}

	/**
	 * The config parameter which defines whether to skip file splits with a wrong schema.
	 */
	public static final String PARQUET_SKIP_WRONG_SCHEMA_SPLITS = "skip.splits.wrong.schema";

	/**
	 * The config parameter which defines whether to skip corrupted records.
	 */
	public static final String PARQUET_SKIP_CORRUPTED_RECORD = "skip.corrupted.record";
}