@@ -274,18 +274,16 @@ public StreamSync(HoodieStreamer.Config cfg, SparkSession sparkSession, SchemaPr
     this.processedSchema = new SchemaSet();
     this.autoGenerateRecordKeys = KeyGenUtils.enableAutoGenerateRecordKeys(props);
     this.keyGenClassName = getKeyGeneratorClassName(new TypedProperties(props));
-    refreshTimeline();
-    // Register User Provided schema first
-    registerAvroSchemas(schemaProvider);
-
-
-    this.metrics = (HoodieIngestionMetrics) ReflectionUtils.loadClass(cfg.ingestionMetricsClass, getHoodieClientConfig(this.schemaProvider));
-    this.hoodieMetrics = new HoodieMetrics(getHoodieClientConfig(this.schemaProvider));
     this.conf = conf;
+
+    HoodieWriteConfig hoodieWriteConfig = getHoodieClientConfig();
+    this.metrics = (HoodieIngestionMetrics) ReflectionUtils.loadClass(cfg.ingestionMetricsClass, hoodieWriteConfig);
+    this.hoodieMetrics = new HoodieMetrics(hoodieWriteConfig);
     if (props.getBoolean(ERROR_TABLE_ENABLED.key(), ERROR_TABLE_ENABLED.defaultValue())) {
       this.errorTableWriter = ErrorTableUtils.getErrorTableWriter(cfg, sparkSession, props, hoodieSparkContext, fs);
       this.errorWriteFailureStrategy = ErrorTableUtils.getErrorWriteFailureStrategy(props);
     }
+    refreshTimeline();
     Source source = UtilHelpers.createSource(cfg.sourceClassName, props, hoodieSparkContext.jsc(), sparkSession, schemaProvider, metrics);
     this.formatAdapter = new SourceFormatAdapter(source, this.errorTableWriter, Option.of(props));
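Note on the hunk above: the old constructor built the write-client config twice, once per metrics object, and each `getHoodieClientConfig(this.schemaProvider)` call could resolve the target schema through the provider. The new ordering builds one schema-free `HoodieWriteConfig` and shares it, defers `refreshTimeline()` until the error-table writer is wired up, and drops `registerAvroSchemas(schemaProvider)` here; registration now happens in `reInitWriteClient` with the resolved writer schema (see the `@@ -1013` hunk below). The reuse pattern in isolation, as a sketch using the diff's own names:

```java
// Build the config once, without resolving a writer schema, and share the
// instance with every consumer that only needs configuration properties.
HoodieWriteConfig hoodieWriteConfig = getHoodieClientConfig();
this.metrics = (HoodieIngestionMetrics) ReflectionUtils.loadClass(
    cfg.ingestionMetricsClass, hoodieWriteConfig);
this.hoodieMetrics = new HoodieMetrics(hoodieWriteConfig);
```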
@@ -309,7 +307,7 @@ public void refreshTimeline() throws IOException {
     if (fs.exists(new Path(cfg.targetBasePath))) {
       try {
         HoodieTableMetaClient meta = HoodieTableMetaClient.builder()
-            .setConf(new Configuration(fs.getConf()))
+            .setConf(conf)
             .setBasePath(cfg.targetBasePath)
             .setPayloadClassName(cfg.payloadClassName)
             .setRecordMergerStrategy(props.getProperty(HoodieWriteConfig.RECORD_MERGER_STRATEGY.key(), HoodieWriteConfig.RECORD_MERGER_STRATEGY.defaultValue()))
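The `setConf` change in this hunk repeats in three hunks below: instead of copy-constructing a Hadoop `Configuration` from `fs.getConf()` on every meta-client build, the builder now receives the `conf` field the constructor stores. A compilable sketch of the two variants, assuming `hadoop-common` on the classpath; the class and method names are illustrative:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

class ConfReuseSketch {
  // Before: every call paid for a full defensive copy of the properties.
  static Configuration perCallCopy(FileSystem fs) {
    return new Configuration(fs.getConf());
  }

  // After: the single Configuration handed to StreamSync is reused as-is.
  static Configuration shared(Configuration conf) {
    return conf;
  }
}
```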
@@ -337,7 +335,7 @@ public void refreshTimeline() throws IOException {
         LOG.warn("Base path exists, but table is not fully initialized. Re-initializing again");
         initializeEmptyTable();
         // reload the timeline from metaClient and validate that it is an empty table. If any instants are found, fail the pipeline, because hoodie.properties got deleted by mistake.
-        HoodieTableMetaClient metaClientToValidate = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf())).setBasePath(cfg.targetBasePath).build();
+        HoodieTableMetaClient metaClientToValidate = HoodieTableMetaClient.builder().setConf(conf).setBasePath(cfg.targetBasePath).build();
         if (metaClientToValidate.reloadActiveTimeline().countInstants() > 0) {
           // Deleting the recreated hoodie.properties and throwing exception.
           fs.delete(new Path(String.format("%s%s/%s", basePathWithForwardSlash, HoodieTableMetaClient.METAFOLDER_NAME, HoodieTableConfig.HOODIE_PROPERTIES_FILE)));
@@ -395,7 +393,7 @@ public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException
     // Refresh Timeline
     refreshTimeline();
     HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
-        .setConf(new Configuration(fs.getConf()))
+        .setConf(conf)
         .setBasePath(cfg.targetBasePath)
         .setRecordMergerStrategy(props.getProperty(HoodieWriteConfig.RECORD_MERGER_STRATEGY.key(), HoodieWriteConfig.RECORD_MERGER_STRATEGY.defaultValue()))
         .setTimeGeneratorConfig(HoodieTimeGeneratorConfig.newBuilder().fromProperties(props).withPath(cfg.targetBasePath).build())
@@ -432,15 +430,15 @@ public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException
     }

     // complete the pending compaction before writing to sink
-    if (cfg.retryLastPendingInlineCompactionJob && getHoodieClientConfig(this.schemaProvider).inlineCompactionEnabled()) {
+    if (cfg.retryLastPendingInlineCompactionJob && writeClient.getConfig().inlineCompactionEnabled()) {
       Option<String> pendingCompactionInstant = getLastPendingCompactionInstant(allCommitsTimelineOpt);
       if (pendingCompactionInstant.isPresent()) {
         HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata = writeClient.compact(pendingCompactionInstant.get());
         writeClient.commitCompaction(pendingCompactionInstant.get(), writeMetadata.getCommitMetadata().get(), Option.empty());
         refreshTimeline();
         reInitWriteClient(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema(), null);
       }
-    } else if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
+    } else if (cfg.retryLastPendingInlineClusteringJob && writeClient.getConfig().inlineClusteringEnabled()) {
       // complete the pending clustering before writing to sink
       Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
       if (pendingClusteringInstant.isPresent()) {
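The retry checks now read the inline-compaction and inline-clustering flags from the live write client instead of rebuilding a config (which previously also went through the schema provider). A sketch, assuming `writeClient.getConfig()` returns the `HoodieWriteConfig` the client was built with:

```java
// Flags come from the config the writer actually uses.
HoodieWriteConfig active = writeClient.getConfig();
boolean retryCompaction = cfg.retryLastPendingInlineCompactionJob && active.inlineCompactionEnabled();
boolean retryClustering = cfg.retryLastPendingInlineClusteringJob && active.inlineClusteringEnabled();
```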
@@ -1001,7 +999,7 @@ public void runMetaSync() {
    * this constraint.
    */
   private void setupWriteClient(Option<JavaRDD<HoodieRecord>> recordsOpt) throws IOException {
-    if ((null != schemaProvider)) {
+    if (null != schemaProvider) {
       Schema sourceSchema = schemaProvider.getSourceSchema();
       Schema targetSchema = schemaProvider.getTargetSchema();
       reInitWriteClient(sourceSchema, targetSchema, recordsOpt);
@@ -1013,8 +1011,9 @@ private void reInitWriteClient(Schema sourceSchema, Schema targetSchema, Option<
     if (HoodieStreamerUtils.isDropPartitionColumns(props)) {
       targetSchema = HoodieAvroUtils.removeFields(targetSchema, HoodieStreamerUtils.getPartitionColumns(props));
     }
-    registerAvroSchemas(sourceSchema, targetSchema);
-    final HoodieWriteConfig initialWriteConfig = getHoodieClientConfig(targetSchema);
+    final Pair<HoodieWriteConfig, Schema> initialWriteConfigAndSchema = getHoodieClientConfigAndWriterSchema(targetSchema, true);
+    final HoodieWriteConfig initialWriteConfig = initialWriteConfigAndSchema.getLeft();
+    registerAvroSchemas(sourceSchema, initialWriteConfigAndSchema.getRight());
     final HoodieWriteConfig writeConfig = SparkSampleWritesUtils
         .getWriteConfigWithRecordSizeEstimate(hoodieSparkContext.jsc(), recordsOpt, initialWriteConfig)
         .orElse(initialWriteConfig);
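The reordering here is the point of the refactor: `registerAvroSchemas` now receives the writer schema resolved by `getHoodieClientConfigAndWriterSchema`, which may come from commit metadata rather than the provider's raw `targetSchema`. In sketch form, using the diff's names:

```java
// Resolve the config and the effective writer schema together, then
// register the schema that will actually be written (right side of the Pair).
Pair<HoodieWriteConfig, Schema> cfgAndSchema =
    getHoodieClientConfigAndWriterSchema(targetSchema, true);
registerAvroSchemas(sourceSchema, cfgAndSchema.getRight());
```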
@@ -1036,20 +1035,21 @@ private void reInitWriteClient(Schema sourceSchema, Schema targetSchema, Option<
   }

   /**
-   * Helper to construct Write Client config.
-   *
-   * @param schemaProvider Schema Provider
+   * Helper to construct Write Client config without a schema.
    */
-  private HoodieWriteConfig getHoodieClientConfig(SchemaProvider schemaProvider) {
-    return getHoodieClientConfig(schemaProvider != null ? schemaProvider.getTargetSchema() : null);
+  private HoodieWriteConfig getHoodieClientConfig() {
+    return getHoodieClientConfigAndWriterSchema(null, false).getLeft();
   }

   /**
    * Helper to construct Write Client config.
    *
-   * @param schema Schema
+   * @param schema initial writer schema. If null or of Avro Null type, the schema is fetched from previous commit metadata for the table.
+   * @param requireSchemaInConfig whether the schema should be present in the config. This is an optimization to avoid fetching the schema from previous commits when it is not needed.
+   *
+   * @return Pair of HoodieWriteConfig and the writer schema.
    */
-  private HoodieWriteConfig getHoodieClientConfig(Schema schema) {
+  private Pair<HoodieWriteConfig, Schema> getHoodieClientConfigAndWriterSchema(Schema schema, boolean requireSchemaInConfig) {
     final boolean combineBeforeUpsert = true;
     final boolean autoCommit = false;
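The old overload pair collapses into two entry points: a cheap, schema-free `getHoodieClientConfig()` for metrics and flag checks, and `getHoodieClientConfigAndWriterSchema(schema, requireSchemaInConfig)` for the write path, which also returns the schema it resolved so callers stop re-deriving it. A self-contained toy with the same shape; the names are illustrative, not Hudi API, and `Pair` here is commons-lang3:

```java
import org.apache.commons.lang3.tuple.Pair;

class ConfigFactorySketch {
  // Cheap variant: no schema resolution, for consumers that only need props.
  static String buildConfig() {
    return buildConfigAndSchema(null, false).getLeft();
  }

  // Full variant: resolves the schema at most once and returns it alongside
  // the config so the caller can reuse it (e.g., for schema registration).
  static Pair<String, String> buildConfigAndSchema(String schema, boolean requireSchema) {
    String resolved = (requireSchema && schema == null) ? fetchFromTable() : schema;
    return Pair.of("write-config", resolved);
  }

  // Stand-in for the TableSchemaResolver lookup in the real code.
  static String fetchFromTable() {
    return "schema-from-commit-metadata";
  }
}
```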
@@ -1075,8 +1075,13 @@ private HoodieWriteConfig getHoodieClientConfig(Schema schema) {
         .withAutoCommit(autoCommit)
         .withProps(props);

-    if (schema != null) {
-      builder.withSchema(getSchemaForWriteConfig(schema).toString());
+    // If the schema is required in the config, we need to handle the case where the target schema is null and should be fetched from previous commits
+    final Schema returnSchema;
+    if (requireSchemaInConfig) {
+      returnSchema = getSchemaForWriteConfig(schema);
+      builder.withSchema(returnSchema.toString());
+    } else {
+      returnSchema = schema;
     }

     HoodieWriteConfig config = builder.build();
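The `requireSchemaInConfig` flag preserves the cheap path of the old `schema != null` guard: when false, the incoming schema (possibly null) passes through untouched and no table read happens; when true, `getSchemaForWriteConfig` resolves it, possibly from commit metadata, and embeds it in the config. The two call paths, sketched with the diff's methods:

```java
// Constructor/metrics path: no schema in the config, no table read.
HoodieWriteConfig metricsConfig =
    getHoodieClientConfigAndWriterSchema(null, false).getLeft();
// Write path: resolve the writer schema and embed it in the config.
Pair<HoodieWriteConfig, Schema> writeCfgAndSchema =
    getHoodieClientConfigAndWriterSchema(targetSchema, true);
```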
@@ -1108,30 +1113,28 @@ private HoodieWriteConfig getHoodieClientConfig(Schema schema) {
         String.format("%s should be set to %s", COMBINE_BEFORE_INSERT.key(), cfg.filterDupes));
     ValidationUtils.checkArgument(config.shouldCombineBeforeUpsert(),
         String.format("%s should be set to %s", COMBINE_BEFORE_UPSERT.key(), combineBeforeUpsert));
-    return config;
+    return Pair.of(config, returnSchema);
   }

   private Schema getSchemaForWriteConfig(Schema targetSchema) {
     Schema newWriteSchema = targetSchema;
     try {
-      if (targetSchema != null) {
-        // check if targetSchema is equal to NULL schema
-        if (SchemaCompatibility.checkReaderWriterCompatibility(targetSchema, InputBatch.NULL_SCHEMA).getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE
-            && SchemaCompatibility.checkReaderWriterCompatibility(InputBatch.NULL_SCHEMA, targetSchema).getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE) {
-          // target schema is null. fetch schema from commit metadata and use it
-          HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(new Configuration(fs.getConf()))
-              .setBasePath(cfg.targetBasePath)
-              .setPayloadClassName(cfg.payloadClassName)
-              .build();
-          int totalCompleted = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants();
-          if (totalCompleted > 0) {
-            TableSchemaResolver schemaResolver = new TableSchemaResolver(meta);
-            Option<Schema> tableSchema = schemaResolver.getTableAvroSchemaIfPresent(false);
-            if (tableSchema.isPresent()) {
-              newWriteSchema = tableSchema.get();
-            } else {
-              LOG.warn("Could not fetch schema from table. Falling back to using target schema from schema provider");
-            }
+      // check if targetSchema is equal to NULL schema
+      if (targetSchema == null || (SchemaCompatibility.checkReaderWriterCompatibility(targetSchema, InputBatch.NULL_SCHEMA).getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE
+          && SchemaCompatibility.checkReaderWriterCompatibility(InputBatch.NULL_SCHEMA, targetSchema).getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE)) {
+        // target schema is null. fetch schema from commit metadata and use it
+        HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(conf)
+            .setBasePath(cfg.targetBasePath)
+            .setPayloadClassName(cfg.payloadClassName)
+            .build();
+        int totalCompleted = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants();
+        if (totalCompleted > 0) {
+          TableSchemaResolver schemaResolver = new TableSchemaResolver(meta);
+          Option<Schema> tableSchema = schemaResolver.getTableAvroSchemaIfPresent(false);
+          if (tableSchema.isPresent()) {
+            newWriteSchema = tableSchema.get();
+          } else {
+            LOG.warn("Could not fetch schema from table. Falling back to using target schema from schema provider");
           }
         }
       }
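Behavioral note on this hunk: before the change, a literal `null` target schema short-circuited the `targetSchema != null` guard and skipped the commit-metadata lookup entirely, while an Avro null-type schema triggered it; now both cases fall through to the table lookup. A small, self-contained check of the null-schema detection, assuming Avro on the classpath and that `InputBatch.NULL_SCHEMA` is the Avro null type:

```java
import org.apache.avro.Schema;
import org.apache.avro.SchemaCompatibility;

class NullSchemaCheck {
  // Stand-in for InputBatch.NULL_SCHEMA.
  static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL);

  static boolean isNullSchema(Schema s) {
    return s == null
        || (SchemaCompatibility.checkReaderWriterCompatibility(s, NULL_SCHEMA).getType()
                == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE
            && SchemaCompatibility.checkReaderWriterCompatibility(NULL_SCHEMA, s).getType()
                == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE);
  }

  public static void main(String[] args) {
    System.out.println(isNullSchema(null));                              // true
    System.out.println(isNullSchema(Schema.create(Schema.Type.NULL)));   // true
    System.out.println(isNullSchema(Schema.create(Schema.Type.STRING))); // false
  }
}
```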