forked from vicenteg/DataflowTemplates
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpannerToText.java
132 lines (120 loc) · 5.55 KB
/
SpannerToText.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*
* Copyright (C) 2018 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.teleport.templates;
import com.google.cloud.teleport.templates.common.JavascriptTextTransformer.JavascriptTextTransformerOptions;
import com.google.cloud.teleport.templates.common.JavascriptTextTransformer.TransformTextViaJavascript;
import com.google.cloud.teleport.templates.common.SpannerConverters;
import com.google.cloud.teleport.templates.common.SpannerConverters.SpannerReadOptions;
import com.google.cloud.teleport.templates.common.TextConverters.FilesystemWriteOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.spanner.LocalSpannerIO;
import org.apache.beam.sdk.io.gcp.spanner.ReadOperation;
import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Dataflow template that exports a Spanner table to a text sink. The table is read through the
 * <a href="https://cloud.google.com/spanner/docs/reads#read_data_in_parallel">Batch API</a>, which
 * creates multiple workers in parallel for better performance. Every row is rendered as a CSV
 * line, optionally passed through a Javascript UDF, and written to files carrying a {@code .csv}
 * suffix under the configured output prefix. The table schema file is saved in json format along
 * with the exported table.
 *
 * <p>Schema file sample: { "id":"INT64", "name":"STRING(MAX)" }
 *
 * <p>A sample run:
 *
 * <pre>
 * mvn compile exec:java \
 *   -Dexec.mainClass=com.google.cloud.teleport.templates.SpannerToText \
 *   -Dexec.args="--runner=DataflowRunner \
 *     --spannerProjectId=projectId \
 *     --gcpTempLocation=gs://gsTmpLocation \
 *     --spannerInstanceId=instanceId \
 *     --spannerDatabaseId=databaseId \
 *     --spannerTable=table_name \
 *     --textWritePrefix=gcsOutputPath"
 * </pre>
 */
public class SpannerToText {

  private static final Logger LOG = LoggerFactory.getLogger(SpannerToText.class);

  /**
   * Options accepted by this template: the Spanner read settings, the optional Javascript UDF
   * settings and the text output settings, combined into a single interface.
   */
  public interface SpannerToTextOptions
      extends PipelineOptions,
          SpannerReadOptions,
          JavascriptTextTransformerOptions,
          FilesystemWriteOptions {}

  /**
   * Builds and runs the export pipeline: reads the table from Spanner, converts each row to a CSV
   * line, applies the Javascript UDF when one is configured, and writes the lines through TextIO.
   *
   * @param args arguments to the pipeline
   */
  public static void main(String[] args) {
    LOG.info("Starting pipeline setup");
    PipelineOptionsFactory.register(SpannerToTextOptions.class);

    SpannerToTextOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SpannerToTextOptions.class);
    FileSystems.setDefaultPipelineOptions(options);

    Pipeline pipeline = Pipeline.create(options);
    PCollection<String> csvRows = readTableAsCsv(pipeline, options, spannerConfigFrom(options));
    PCollection<String> outputRows = applyOptionalUdf(csvRows, options);

    outputRows.apply(
        "Write to storage", TextIO.write().to(options.getTextWritePrefix()).withSuffix(".csv"));

    pipeline.run();
    LOG.info("Completed pipeline setup");
  }

  /**
   * Assembles the Spanner connection settings (host, project, instance, database) from the
   * pipeline options.
   *
   * @param options the template options holding the Spanner settings
   * @return the Spanner configuration used by the export and read steps
   */
  private static SpannerConfig spannerConfigFrom(SpannerToTextOptions options) {
    return SpannerConfig.create()
        .withHost(options.getSpannerHost())
        .withProjectId(options.getSpannerProjectId())
        .withInstanceId(options.getSpannerInstanceId())
        .withDatabaseId(options.getSpannerDatabaseId());
  }

  /**
   * Adds the export, read and CSV-conversion steps to the pipeline.
   *
   * @param pipeline the pipeline to extend
   * @param options the template options naming the table and the output prefix
   * @param spannerConfig the Spanner connection settings
   * @return one CSV line per row read from Spanner
   */
  private static PCollection<String> readTableAsCsv(
      Pipeline pipeline, SpannerToTextOptions options, SpannerConfig spannerConfig) {
    PTransform<PBegin, PCollection<ReadOperation>> exportStep =
        SpannerConverters.ExportTransformFactory.create(
            options.getSpannerTable(), spannerConfig, options.getTextWritePrefix());

    PCollection<ReadOperation> readOperations = pipeline.apply("Create export", exportStep);

    // SpannerIO.readAll() is used rather than SpannerIO.read(): ValueProvider parameters such as
    // the table name, which SpannerIO.read() requires, can be read only inside a DoFn, but
    // SpannerIO.read() is a PTransform<PBegin, Struct> and so cannot be preceded by a DoFn that
    // resolves those parameters at pipeline execution time.
    return readOperations
        .apply("Read all records", LocalSpannerIO.readAll().withSpannerConfig(spannerConfig))
        .apply(
            "Struct To Csv",
            MapElements.into(TypeDescriptors.strings())
                .via(row -> new SpannerConverters.StructCsvPrinter().print(row)));
  }

  /**
   * Appends the Javascript UDF step when the UDF path option is accessible; otherwise returns the
   * input unchanged. The UDF takes a CSV row as input and produces a transformed CSV row.
   *
   * @param csvRows the CSV lines produced from Spanner
   * @param options the template options holding the UDF path and function name
   * @return the transformed lines, or {@code csvRows} itself when no UDF step is added
   */
  private static PCollection<String> applyOptionalUdf(
      PCollection<String> csvRows, SpannerToTextOptions options) {
    // NOTE(review): isAccessible() is evaluated while the graph is being built; presumably a path
    // supplied only at template run time is not accessible here, so the UDF step would be left
    // out — confirm this is the intended behavior.
    if (!options.getJavascriptTextTransformGcsPath().isAccessible()) {
      return csvRows;
    }
    return csvRows.apply(
        "JavascriptUDF",
        TransformTextViaJavascript.newBuilder()
            .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
            .setFunctionName(options.getJavascriptTextTransformFunctionName())
            .build());
  }
}