Skip to content

Commit

Permalink
🎉 S3 destination: Avro & Jsonl output (airbytehq#4227)
Browse files Browse the repository at this point in the history
* Add jsonl format to spec.json

* Implement jsonl writer

* Add documentation

* Add acceptance test

* Update document

* Bump version

* Update document example

* Implement avro writer

* Implement compression codec

* Update documentation

* Revise documentation

* Add more tests

* Add acceptance test

* Format code

* Create helper method for name updater

* Update csv doc with normalization

* Update version date
  • Loading branch information
tuliren authored Jun 23, 2021
1 parent 0e5ec6a commit e2c5b1d
Show file tree
Hide file tree
Showing 33 changed files with 1,233 additions and 200 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
"destinationDefinitionId": "4816b78f-1489-44c1-9060-4b19d5fa9362",
"name": "S3",
"dockerRepository": "airbyte/destination-s3",
"dockerImageTag": "0.1.6",
"dockerImageTag": "0.1.7",
"documentationUrl": "https://docs.airbyte.io/integrations/destinations/s3"
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
- destinationDefinitionId: 4816b78f-1489-44c1-9060-4b19d5fa9362
name: S3
dockerRepository: airbyte/destination-s3
dockerImageTag: 0.1.6
dockerImageTag: 0.1.7
documentationUrl: https://docs.airbyte.io/integrations/destinations/s3
- destinationDefinitionId: f7a7d195-377f-cf5b-70a5-be6b819019dc
name: Redshift
Expand Down
3 changes: 2 additions & 1 deletion airbyte-integrations/builds.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,6 @@

Redshift [![destination-redshift](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-redshift%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-redshift)

Snowflake [![destination-snowflake](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-snowflake%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-snowflake)
S3 [![destination-s3](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-s3%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-s3)

Snowflake [![destination-snowflake](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-snowflake%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-snowflake)
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/destination-s3/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar

RUN tar xf ${APPLICATION}.tar --strip-components=1

LABEL io.airbyte.version=0.1.6
LABEL io.airbyte.version=0.1.7
LABEL io.airbyte.name=airbyte/destination-s3
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,7 @@ protected void acceptTracked(AirbyteMessage airbyteMessage) throws Exception {
Jsons.serialize(configuredCatalog), Jsons.serialize(recordMessage)));
}

UUID id = UUID.randomUUID();
streamNameAndNamespaceToWriters.get(pair).write(id, recordMessage);
streamNameAndNamespaceToWriters.get(pair).write(UUID.randomUUID(), recordMessage);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@

public enum S3Format {

AVRO("avro"),
CSV("csv"),
JSONL("jsonl"),
PARQUET("parquet");

private final String fileExtension;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@

import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.commons.json.Jsons;
import io.airbyte.integrations.destination.s3.avro.S3AvroFormatConfig;
import io.airbyte.integrations.destination.s3.csv.S3CsvFormatConfig;
import io.airbyte.integrations.destination.s3.jsonl.S3JsonlFormatConfig;
import io.airbyte.integrations.destination.s3.parquet.S3ParquetFormatConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -41,9 +43,15 @@ public static S3FormatConfig getS3FormatConfig(JsonNode config) {
S3Format formatType = S3Format.valueOf(formatConfig.get("format_type").asText().toUpperCase());

switch (formatType) {
case AVRO -> {
return new S3AvroFormatConfig(formatConfig);
}
case CSV -> {
return new S3CsvFormatConfig(formatConfig);
}
case JSONL -> {
return new S3JsonlFormatConfig();
}
case PARQUET -> {
return new S3ParquetFormatConfig(formatConfig);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.s3.avro;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.airbyte.commons.jackson.MoreMappers;
import io.airbyte.integrations.base.JavaBaseConstants;
import io.airbyte.protocol.models.AirbyteRecordMessage;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import tech.allegro.schema.json2avro.converter.JsonAvroConverter;

/**
 * Converts Airbyte record messages into Avro {@link GenericData.Record}s that conform to a fixed
 * Avro {@link Schema}.
 */
public class AvroRecordFactory {

  private static final ObjectMapper MAPPER = MoreMappers.initMapper();
  private static final ObjectWriter WRITER = MAPPER.writer();

  private final Schema schema;
  private final JsonFieldNameUpdater nameUpdater;
  private final JsonAvroConverter converter = new JsonAvroConverter();

  /**
   * @param schema Avro schema every produced record is converted against
   * @param nameUpdater applied to the record data before conversion (see
   *        {@code getJsonWithStandardizedFieldNames})
   */
  public AvroRecordFactory(Schema schema, JsonFieldNameUpdater nameUpdater) {
    this.schema = schema;
    this.nameUpdater = nameUpdater;
  }

  /**
   * Builds one Avro record out of an Airbyte record message.
   * <p>
   * The intermediate JSON object holds the Airbyte id column, the emitted-at column, and then every
   * field of the (name-standardized) record data, in that order.
   *
   * @param id value stored, as a string, in the Airbyte id column
   * @param recordMessage message supplying the emitted-at timestamp and the data payload
   * @return the record produced by the JSON-to-Avro converter for this factory's schema
   * @throws JsonProcessingException if the intermediate JSON object cannot be serialized
   */
  public GenericData.Record getAvroRecord(UUID id, AirbyteRecordMessage recordMessage) throws JsonProcessingException {
    final JsonNode standardizedData = nameUpdater.getJsonWithStandardizedFieldNames(recordMessage.getData());

    final ObjectNode payload = MAPPER.createObjectNode();
    payload.put(JavaBaseConstants.COLUMN_NAME_AB_ID, id.toString());
    payload.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, recordMessage.getEmittedAt());
    // The data is cast to a JSON object; any other node type fails here with a ClassCastException.
    payload.setAll((ObjectNode) standardizedData);

    final byte[] serializedPayload = WRITER.writeValueAsBytes(payload);
    return converter.convertToGenericDataRecord(serializedPayload, schema);
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
* SOFTWARE.
*/

package io.airbyte.integrations.destination.s3.parquet;
package io.airbyte.integrations.destination.s3.avro;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableMap;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
* SOFTWARE.
*/

package io.airbyte.integrations.destination.s3.parquet;
package io.airbyte.integrations.destination.s3.avro;

import org.apache.avro.Schema;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
* SOFTWARE.
*/

package io.airbyte.integrations.destination.s3.parquet;
package io.airbyte.integrations.destination.s3.avro;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Preconditions;
Expand Down Expand Up @@ -51,7 +51,7 @@
* ones.
* <p/>
* For limitations of this converter, see the README of this connector:
* https://docs.airbyte.io/integrations/destinations/s3#parquet
* https://docs.airbyte.io/integrations/destinations/s3#avro
*/
public class JsonToAvroSchemaConverter {

Expand Down Expand Up @@ -102,8 +102,8 @@ public Schema getAvroSchema(JsonNode jsonSchema,
stdName);
builder = builder.doc(
String.format("%s%s%s",
S3ParquetConstants.DOC_KEY_ORIGINAL_NAME,
S3ParquetConstants.DOC_KEY_VALUE_DELIMITER,
S3AvroConstants.DOC_KEY_ORIGINAL_NAME,
S3AvroConstants.DOC_KEY_VALUE_DELIMITER,
name));
}
if (namespace != null) {
Expand All @@ -130,8 +130,8 @@ public Schema getAvroSchema(JsonNode jsonSchema,
LOGGER.warn("Field name contains illegal character(s) and is standardized: {} -> {}",
fieldName, stdFieldName);
fieldBuilder = fieldBuilder.doc(String.format("%s%s%s",
S3ParquetConstants.DOC_KEY_ORIGINAL_NAME,
S3ParquetConstants.DOC_KEY_VALUE_DELIMITER,
S3AvroConstants.DOC_KEY_ORIGINAL_NAME,
S3AvroConstants.DOC_KEY_VALUE_DELIMITER,
fieldName));
}
assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition))
Expand Down Expand Up @@ -202,7 +202,7 @@ Schema getNullableFieldTypes(String fieldName, JsonNode fieldDefinition) {
if (nonNullFieldTypes.isEmpty()) {
return Schema.create(Schema.Type.NULL);
} else {
// Mark every field as nullable to prevent missing value exceptions from Parquet.
// Mark every field as nullable to prevent missing value exceptions from Avro / Parquet.
nonNullFieldTypes.add(0, Schema.create(Schema.Type.NULL));
return Schema.createUnion(nonNullFieldTypes);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.s3.avro;

/**
 * Constants used when building Avro schemas for the S3 destination.
 * <p>
 * When a record or field name has to be standardized, the schema converter stores the original
 * name in the Avro {@code doc} attribute as
 * {@code DOC_KEY_ORIGINAL_NAME + DOC_KEY_VALUE_DELIMITER + <original name>}.
 * <p>
 * This class only holds constants and is not meant to be instantiated or extended.
 */
public final class S3AvroConstants {

  /** Separator between the doc key and its value. */
  public static final String DOC_KEY_VALUE_DELIMITER = ":";

  /** Doc key under which the original (non-standardized) name is recorded. */
  public static final String DOC_KEY_ORIGINAL_NAME = "_airbyte_original_name";

  private S3AvroConstants() {
    // Constants holder; no instances.
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.s3.avro;

import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.integrations.destination.s3.S3Format;
import io.airbyte.integrations.destination.s3.S3FormatConfig;
import org.apache.avro.file.CodecFactory;

/**
 * Format configuration for Avro output. It holds the Avro {@link CodecFactory} derived from the
 * optional {@code compression_codec} section of the format config.
 */
public class S3AvroFormatConfig implements S3FormatConfig {

  private final CodecFactory codecFactory;

  /**
   * @param formatConfig format section of the destination config; its {@code compression_codec}
   *        child (which may be absent) selects the codec
   * @throws IllegalArgumentException if the codec name is unknown or a compression level is out of
   *         range
   */
  public S3AvroFormatConfig(JsonNode formatConfig) {
    this.codecFactory = parseCodecConfig(formatConfig.get("compression_codec"));
  }

  /**
   * Translates a {@code compression_codec} config node into an Avro {@link CodecFactory}.
   * <p>
   * A missing or JSON-null node, or a node whose {@code codec} child is missing or not a string,
   * yields the null (no compression) codec.
   *
   * @param compressionCodecConfig the {@code compression_codec} node; may be {@code null}
   * @return the codec factory for the configured codec
   * @throws IllegalArgumentException if the codec name is not one of the {@link CompressionCodec}
   *         config values, or a configured compression level is outside the codec's range
   */
  public static CodecFactory parseCodecConfig(JsonNode compressionCodecConfig) {
    if (compressionCodecConfig == null || compressionCodecConfig.isNull()) {
      return CodecFactory.nullCodec();
    }

    // isTextual() is false for JSON null as well, so no separate null-node check is needed.
    JsonNode codecConfig = compressionCodecConfig.get("codec");
    if (codecConfig == null || !codecConfig.isTextual()) {
      return CodecFactory.nullCodec();
    }

    // fromConfigValue rejects unknown names, so the switch only needs to cover the enum constants.
    CompressionCodec codec = CompressionCodec.fromConfigValue(codecConfig.asText());
    return switch (codec) {
      case NULL -> CodecFactory.nullCodec();
      case DEFLATE -> CodecFactory.deflateCodec(getCompressionLevel(compressionCodecConfig, 0, 0, 9));
      case BZIP2 -> CodecFactory.bzip2Codec();
      case XZ -> CodecFactory.xzCodec(getCompressionLevel(compressionCodecConfig, 6, 0, 9));
      case ZSTANDARD -> {
        int compressionLevel = getCompressionLevel(compressionCodecConfig, 3, -5, 22);
        boolean includeChecksum = getIncludeChecksum(compressionCodecConfig, false);
        yield CodecFactory.zstandardCodec(compressionLevel, includeChecksum);
      }
      case SNAPPY -> CodecFactory.snappyCodec();
    };
  }

  /**
   * Reads {@code compression_level} from the codec config.
   *
   * @param compressionCodecConfig the {@code compression_codec} node; must not be {@code null}
   * @param defaultLevel returned when the level is missing, JSON null, or not an integral number
   * @param minLevel lowest accepted level (inclusive)
   * @param maxLevel highest accepted level (inclusive)
   * @return the configured level, or {@code defaultLevel}
   * @throws IllegalArgumentException if an integral level is outside {@code [minLevel, maxLevel]}
   */
  public static int getCompressionLevel(JsonNode compressionCodecConfig, int defaultLevel, int minLevel, int maxLevel) {
    // isIntegralNumber() is false for JSON null as well.
    JsonNode levelConfig = compressionCodecConfig.get("compression_level");
    if (levelConfig == null || !levelConfig.isIntegralNumber()) {
      return defaultLevel;
    }
    int level = levelConfig.asInt();
    if (level < minLevel || level > maxLevel) {
      throw new IllegalArgumentException(
          String.format("Invalid compression level: %d, expected an integer in range [%d, %d]", level, minLevel, maxLevel));
    }
    return level;
  }

  /**
   * Reads {@code include_checksum} from the codec config.
   *
   * @param compressionCodecConfig the {@code compression_codec} node; must not be {@code null}
   * @param defaultValue returned when the flag is missing, JSON null, or not a boolean
   * @return the configured flag, or {@code defaultValue}
   */
  public static boolean getIncludeChecksum(JsonNode compressionCodecConfig, boolean defaultValue) {
    // isBoolean() is false for JSON null and for numbers, so this single check covers them all.
    JsonNode checksumConfig = compressionCodecConfig.get("include_checksum");
    if (checksumConfig == null || !checksumConfig.isBoolean()) {
      return defaultValue;
    }
    return checksumConfig.asBoolean();
  }

  public CodecFactory getCodecFactory() {
    return codecFactory;
  }

  @Override
  public S3Format getFormat() {
    return S3Format.AVRO;
  }

  /**
   * Compression codecs selectable in the config, each with the string used for it in the
   * {@code codec} field.
   */
  public enum CompressionCodec {

    NULL("no compression"),
    DEFLATE("deflate"),
    BZIP2("bzip2"),
    XZ("xz"),
    ZSTANDARD("zstandard"),
    SNAPPY("snappy");

    private final String configValue;

    CompressionCodec(String configValue) {
      this.configValue = configValue;
    }

    /**
     * Looks up a codec by its config string, ignoring case.
     *
     * @param configValue value of the {@code codec} config field
     * @return the matching codec
     * @throws IllegalArgumentException if no codec matches, including when {@code configValue} is
     *         {@code null}
     */
    public static CompressionCodec fromConfigValue(String configValue) {
      for (CompressionCodec codec : values()) {
        // Constant-side receiver keeps a null argument from raising a NullPointerException.
        if (codec.configValue.equalsIgnoreCase(configValue)) {
          return codec;
        }
      }
      throw new IllegalArgumentException("Unknown codec config value: " + configValue);
    }

  }

}
Loading

0 comments on commit e2c5b1d

Please sign in to comment.