/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.gcp.bigquery;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createJobIdToken;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasEntry;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import com.google.api.client.util.Data;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatistics2;
import com.google.api.services.bigquery.model.JobStatistics4;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.math.BigDecimal;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.IterableCoder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.PassThroughThenCleanup.CleanupOperation;
import org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.testing.CoderProperties;
import org.apache.beam.sdk.testing.ExpectedLogs;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.SourceTestUtils.ExpectedSplitOutcome;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.DoFnTester;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.DisplayDataEvaluator;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.transforms.windowing.IncompatibleWindowException;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.NonMergingWindowFn;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
import org.apache.beam.sdk.transforms.windowing.WindowMappingFn;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.PCollectionViews;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.hamcrest.CoreMatchers;
import org.hamcrest.Matchers;
import org.joda.time.Instant;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
 * Tests for {@link BigQueryIO}.
 */
@RunWith(JUnit4.class)
public class BigQueryIOTest implements Serializable {
private static Path tempFolder;
  // Table information must be static, as each ParDo will get a separate instance of
  // FakeDatasetService, and they must all modify the same storage.
static com.google.common.collect.Table<String, String, Map<String, TableContainer>>
tables = HashBasedTable.create();
@Rule public final transient TestPipeline p = TestPipeline.create();
@Rule public transient ExpectedException thrown = ExpectedException.none();
@Rule public transient ExpectedLogs loggedBigQueryIO = ExpectedLogs.none(BigQueryIO.class);
@Rule public transient ExpectedLogs loggedWriteRename = ExpectedLogs.none(WriteRename.class);
@Rule public transient ExpectedLogs loggedWriteTables = ExpectedLogs.none(WriteTables.class);
@Rule public transient TemporaryFolder testFolder = new TemporaryFolder();
private void checkReadTableObject(
BigQueryIO.Read read, String project, String dataset, String table) {
checkReadTableObjectWithValidate(read, project, dataset, table, true);
}
private void checkReadQueryObject(BigQueryIO.Read read, String query) {
checkReadQueryObjectWithValidate(read, query, true);
}
private void checkReadTableObjectWithValidate(
BigQueryIO.Read read, String project, String dataset, String table, boolean validate) {
assertEquals(project, read.getTable().getProjectId());
assertEquals(dataset, read.getTable().getDatasetId());
assertEquals(table, read.getTable().getTableId());
assertNull(read.getQuery());
assertEquals(validate, read.getValidate());
}
private void checkReadQueryObjectWithValidate(
BigQueryIO.Read read, String query, boolean validate) {
assertNull(read.getTable());
assertEquals(query, read.getQuery().get());
assertEquals(validate, read.getValidate());
}
private void checkWriteObject(
      BigQueryIO.Write<TableRow> write, String project, String dataset, String table,
TableSchema schema, CreateDisposition createDisposition,
WriteDisposition writeDisposition, String tableDescription) {
checkWriteObjectWithValidate(
write,
project,
dataset,
table,
schema,
createDisposition,
writeDisposition,
tableDescription,
true);
}
private void checkWriteObjectWithValidate(
BigQueryIO.Write<TableRow> write, String project, String dataset, String table,
TableSchema schema, CreateDisposition createDisposition,
WriteDisposition writeDisposition, String tableDescription, boolean validate) {
assertEquals(project, write.getTable().get().getProjectId());
assertEquals(dataset, write.getTable().get().getDatasetId());
assertEquals(table, write.getTable().get().getTableId());
if (schema == null) {
assertNull(write.getJsonSchema());
assertNull(write.getSchemaFromView());
} else {
assertEquals(schema, BigQueryHelpers.fromJsonString(
write.getJsonSchema().get(), TableSchema.class));
}
assertEquals(createDisposition, write.getCreateDisposition());
assertEquals(writeDisposition, write.getWriteDisposition());
assertEquals(tableDescription, write.getTableDescription());
assertEquals(validate, write.getValidate());
}
@BeforeClass
public static void setupClass() throws IOException {
tempFolder = Files.createTempDirectory("BigQueryIOTest");
}
@Before
public void setUp() throws IOException {
tables = HashBasedTable.create();
BigQueryIO.clearCreatedTables();
}
@Test
public void testBuildTableBasedSource() {
BigQueryIO.Read read = BigQueryIO.read().from("foo.com:project:somedataset.sometable");
checkReadTableObject(read, "foo.com:project", "somedataset", "sometable");
}
@Test
public void testBuildQueryBasedSource() {
BigQueryIO.Read read = BigQueryIO.read().fromQuery("foo_query");
checkReadQueryObject(read, "foo_query");
}
@Test
public void testBuildTableBasedSourceWithoutValidation() {
// This test just checks that using withoutValidation will not trigger object
// construction errors.
BigQueryIO.Read read =
BigQueryIO.read().from("foo.com:project:somedataset.sometable").withoutValidation();
checkReadTableObjectWithValidate(read, "foo.com:project", "somedataset", "sometable", false);
}
@Test
public void testBuildQueryBasedSourceWithoutValidation() {
// This test just checks that using withoutValidation will not trigger object
// construction errors.
BigQueryIO.Read read =
BigQueryIO.read().fromQuery("some_query").withoutValidation();
checkReadQueryObjectWithValidate(read, "some_query", false);
}
@Test
public void testBuildTableBasedSourceWithDefaultProject() {
BigQueryIO.Read read =
BigQueryIO.read().from("somedataset.sometable");
checkReadTableObject(read, null, "somedataset", "sometable");
}
@Test
public void testBuildSourceWithTableReference() {
TableReference table = new TableReference()
.setProjectId("foo.com:project")
.setDatasetId("somedataset")
.setTableId("sometable");
BigQueryIO.Read read = BigQueryIO.read().from(table);
checkReadTableObject(read, "foo.com:project", "somedataset", "sometable");
}
@Test
public void testValidateReadSetsDefaultProject() throws Exception {
String projectId = "someproject";
String datasetId = "somedataset";
String tableId = "sometable";
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject(projectId);
Path baseDir = Files.createTempDirectory(tempFolder, "testValidateReadSetsDefaultProject");
bqOptions.setTempLocation(baseDir.toString());
FakeDatasetService fakeDatasetService = new FakeDatasetService();
fakeDatasetService.createDataset(projectId, datasetId, "", "");
TableReference tableReference =
new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
fakeDatasetService.createTable(new Table()
.setTableReference(tableReference)
.setSchema(new TableSchema()
.setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER")))));
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(fakeDatasetService);
List<TableRow> expected = ImmutableList.of(
new TableRow().set("name", "a").set("number", 1L),
new TableRow().set("name", "b").set("number", 2L),
new TableRow().set("name", "c").set("number", 3L),
new TableRow().set("name", "d").set("number", 4L),
new TableRow().set("name", "e").set("number", 5L),
new TableRow().set("name", "f").set("number", 6L));
fakeDatasetService.insertAll(tableReference, expected, null);
Pipeline p = TestPipeline.create(bqOptions);
TableReference tableRef = new TableReference();
tableRef.setDatasetId(datasetId);
tableRef.setTableId(tableId);
PCollection<KV<String, Long>> output =
p.apply(BigQueryIO.read().from(tableRef).withTestServices(fakeBqServices))
.apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
c.output(KV.of((String) c.element().get("name"),
Long.valueOf((String) c.element().get("number"))));
}
}));
PAssert.that(output).containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L),
KV.of("c", 3L), KV.of("d", 4L), KV.of("e", 5L), KV.of("f", 6L)));
p.run();
}
@Test
public void testBuildSourceWithTableAndFlatten() {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline p = TestPipeline.create(bqOptions);
thrown.expect(IllegalStateException.class);
thrown.expectMessage(
"Invalid BigQueryIO.Read: Specifies a table with a result flattening preference,"
+ " which only applies to queries");
p.apply("ReadMyTable",
BigQueryIO.read()
.from("foo.com:project:somedataset.sometable")
.withoutResultFlattening());
p.run();
}
@Test
public void testBuildSourceWithTableAndFlattenWithoutValidation() {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline p = TestPipeline.create(bqOptions);
thrown.expect(IllegalStateException.class);
thrown.expectMessage(
"Invalid BigQueryIO.Read: Specifies a table with a result flattening preference,"
+ " which only applies to queries");
p.apply(
BigQueryIO.read()
.from("foo.com:project:somedataset.sometable")
.withoutValidation()
.withoutResultFlattening());
p.run();
}
@Test
public void testBuildSourceWithTableAndSqlDialect() {
BigQueryOptions bqOptions = PipelineOptionsFactory.as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline p = TestPipeline.create(bqOptions);
thrown.expect(IllegalStateException.class);
thrown.expectMessage(
"Invalid BigQueryIO.Read: Specifies a table with a SQL dialect preference,"
+ " which only applies to queries");
p.apply(
BigQueryIO.read()
.from("foo.com:project:somedataset.sometable")
.usingStandardSql());
p.run();
}
@Test
public void testReadFromTable() throws IOException, InterruptedException {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
Job job = new Job();
JobStatus status = new JobStatus();
job.setStatus(status);
JobStatistics jobStats = new JobStatistics();
job.setStatistics(jobStats);
JobStatistics4 extract = new JobStatistics4();
jobStats.setExtract(extract);
extract.setDestinationUriFileCounts(ImmutableList.of(1L));
Table sometable = new Table();
sometable.setSchema(
new TableSchema()
.setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER"))));
sometable.setTableReference(
new TableReference()
.setProjectId("non-executing-project")
.setDatasetId("somedataset")
.setTableId("sometable"));
sometable.setNumBytes(1024L * 1024L);
FakeDatasetService fakeDatasetService = new FakeDatasetService();
fakeDatasetService.createDataset("non-executing-project", "somedataset", "", "");
fakeDatasetService.createTable(sometable);
List<TableRow> records = Lists.newArrayList(
new TableRow().set("name", "a").set("number", 1L),
new TableRow().set("name", "b").set("number", 2L),
new TableRow().set("name", "c").set("number", 3L));
fakeDatasetService.insertAll(sometable.getTableReference(), records, null);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(fakeDatasetService);
Pipeline p = TestPipeline.create(bqOptions);
PCollection<KV<String, Long>> output = p
.apply(BigQueryIO.read().from("non-executing-project:somedataset.sometable")
.withTestServices(fakeBqServices)
.withoutValidation())
.apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
c.output(KV.of((String) c.element().get("name"),
Long.valueOf((String) c.element().get("number"))));
}
}));
PAssert.that(output)
.containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L)));
p.run();
}
// Create an intermediate type to ensure that coder inference up the inheritance tree is tested.
abstract static class StringIntegerDestinations extends DynamicDestinations<String, Integer> {
}
@Test
public void testWriteDynamicDestinationsBatch() throws Exception {
writeDynamicDestinations(false);
}
@Test
public void testWriteDynamicDestinationsStreaming() throws Exception {
writeDynamicDestinations(true);
}
  private void writeDynamicDestinations(boolean streaming) throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("project-id");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
datasetService.createDataset("project-id", "dataset-id", "", "");
final Pattern userPattern = Pattern.compile("([a-z]+)([0-9]+)");
Pipeline p = TestPipeline.create(bqOptions);
final PCollectionView<List<String>> sideInput1 =
p.apply("Create SideInput 1", Create.of("a", "b", "c").withCoder(StringUtf8Coder.of()))
.apply("asList", View.<String>asList());
final PCollectionView<Map<String, String>> sideInput2 =
p.apply("Create SideInput2", Create.of(KV.of("a", "a"), KV.of("b", "b"), KV.of("c", "c")))
.apply("AsMap", View.<String, String>asMap());
final List<String> allUsernames = ImmutableList.of("bill", "bob", "randolph");
List<String> userList = Lists.newArrayList();
// Make sure that we generate enough users so that WriteBundlesToFiles is forced to spill to
// WriteGroupedRecordsToFiles.
for (int i = 0; i < BatchLoads.DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10; ++i) {
      // Assign each user a randomly-chosen nickname.
      String nickname =
          allUsernames.get(ThreadLocalRandom.current().nextInt(allUsernames.size()));
      userList.add(nickname + i);
}
PCollection<String> users = p.apply("CreateUsers", Create.of(userList))
.apply(Window.into(new PartitionedGlobalWindows<>(
new SerializableFunction<String, String>() {
@Override
public String apply(String arg) {
return arg;
}
})));
if (streaming) {
users = users.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
}
users.apply("WriteBigQuery", BigQueryIO.<String>write()
.withTestServices(fakeBqServices)
.withMaxFilesPerBundle(5)
.withMaxFileSize(10)
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
.withFormatFunction(new SerializableFunction<String, TableRow>() {
@Override
public TableRow apply(String user) {
Matcher matcher = userPattern.matcher(user);
if (matcher.matches()) {
return new TableRow().set("name", matcher.group(1))
.set("id", Integer.valueOf(matcher.group(2)));
}
throw new RuntimeException("Unmatching element " + user);
}
})
.to(new StringIntegerDestinations() {
@Override
public Integer getDestination(ValueInSingleWindow<String> element) {
assertThat(element.getWindow(), Matchers.instanceOf(PartitionedGlobalWindow.class));
Matcher matcher = userPattern.matcher(element.getValue());
if (matcher.matches()) {
// Since we name tables by userid, we can simply store an Integer to represent
// a table.
return Integer.valueOf(matcher.group(2));
}
throw new RuntimeException("Unmatching destination " + element.getValue());
}
@Override
public TableDestination getTable(Integer userId) {
verifySideInputs();
            // Each user is written to a separate table.
return new TableDestination("dataset-id.userid-" + userId,
"table for userid " + userId);
}
@Override
public TableSchema getSchema(Integer userId) {
verifySideInputs();
return new TableSchema().setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("id").setType("INTEGER")));
}
@Override
public List<PCollectionView<?>> getSideInputs() {
return ImmutableList.of(sideInput1, sideInput2);
}
private void verifySideInputs() {
assertThat(sideInput(sideInput1), containsInAnyOrder("a", "b", "c"));
Map<String, String> mapSideInput = sideInput(sideInput2);
assertEquals(3, mapSideInput.size());
assertThat(mapSideInput,
allOf(hasEntry("a", "a"), hasEntry("b", "b"), hasEntry("c", "c")));
}
})
.withoutValidation());
p.run();
File tempDir = new File(bqOptions.getTempLocation());
testNumFiles(tempDir, 0);
Map<Integer, List<TableRow>> expectedTableRows = Maps.newHashMap();
for (int i = 0; i < userList.size(); ++i) {
Matcher matcher = userPattern.matcher(userList.get(i));
checkState(matcher.matches());
String nickname = matcher.group(1);
int userid = Integer.valueOf(matcher.group(2));
List<TableRow> expected = expectedTableRows.get(userid);
if (expected == null) {
expected = Lists.newArrayList();
expectedTableRows.put(userid, expected);
}
expected.add(new TableRow().set("name", nickname).set("id", userid));
}
for (Map.Entry<Integer, List<TableRow>> entry : expectedTableRows.entrySet()) {
assertThat(datasetService.getAllRows("project-id", "dataset-id", "userid-" + entry.getKey()),
containsInAnyOrder(Iterables.toArray(entry.getValue(), TableRow.class)));
}
}
@Test
public void testWrite() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
datasetService.createDataset("defaultproject", "dataset-id", "", "");
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
new TableRow().set("name", "a").set("number", 1),
new TableRow().set("name", "b").set("number", 2),
new TableRow().set("name", "c").set("number", 3))
.withCoder(TableRowJsonCoder.of()))
.apply(BigQueryIO.writeTableRows().to("dataset-id.table-id")
.withTableDescription(null)
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
.withSchema(new TableSchema().setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER"))))
.withTestServices(fakeBqServices)
.withoutValidation());
p.run();
File tempDir = new File(bqOptions.getTempLocation());
testNumFiles(tempDir, 0);
}
@Test
public void testStreamingWrite() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
datasetService.createDataset("project-id", "dataset-id", "", "");
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withDatasetService(datasetService);
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
new TableRow().set("name", "a").set("number", 1),
new TableRow().set("name", "b").set("number", 2),
new TableRow().set("name", "c").set("number", 3),
new TableRow().set("name", "d").set("number", 4))
.withCoder(TableRowJsonCoder.of()))
.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED)
.apply(BigQueryIO.writeTableRows().to("project-id:dataset-id.table-id")
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
.withSchema(new TableSchema().setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER"))))
.withTestServices(fakeBqServices)
.withoutValidation());
p.run();
assertThat(datasetService.getAllRows("project-id", "dataset-id", "table-id"),
containsInAnyOrder(
new TableRow().set("name", "a").set("number", 1),
new TableRow().set("name", "b").set("number", 2),
new TableRow().set("name", "c").set("number", 3),
new TableRow().set("name", "d").set("number", 4)));
}
/**
* A generic window function that allows partitioning data into windows by a string value.
*
   * <p>Logically, this creates multiple global windows, and the user provides a function that
   * decides which window a value should go into.
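   *
   * <p>For example (an illustrative sketch; the tests below construct their partition
   * functions inline), partitioning strings by their own value gives every distinct value
   * its own window:
   *
   * <pre>{@code
   * PCollection<String> windowed = input.apply(
   *     Window.into(new PartitionedGlobalWindows<>(
   *         new SerializableFunction<String, String>() {
   *           @Override
   *           public String apply(String value) {
   *             return value;
   *           }
   *         })));
   * }</pre>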
*/
private static class PartitionedGlobalWindows<T> extends
NonMergingWindowFn<T, PartitionedGlobalWindow> {
private SerializableFunction<T, String> extractPartition;
public PartitionedGlobalWindows(SerializableFunction<T, String> extractPartition) {
this.extractPartition = extractPartition;
}
@Override
public Collection<PartitionedGlobalWindow> assignWindows(AssignContext c) {
return Collections.singletonList(new PartitionedGlobalWindow(
extractPartition.apply(c.element())));
}
@Override
public boolean isCompatible(WindowFn<?, ?> o) {
return o instanceof PartitionedGlobalWindows;
}
@Override
public void verifyCompatibility(WindowFn<?, ?> other) throws IncompatibleWindowException {
if (!this.isCompatible(other)) {
throw new IncompatibleWindowException(
other,
String.format(
"%s is only compatible with %s.",
PartitionedGlobalWindows.class.getSimpleName(),
PartitionedGlobalWindows.class.getSimpleName()));
}
}
@Override
public Coder<PartitionedGlobalWindow> windowCoder() {
return new PartitionedGlobalWindowCoder();
}
@Override
public WindowMappingFn<PartitionedGlobalWindow> getDefaultWindowMappingFn() {
throw new UnsupportedOperationException(
"PartitionedGlobalWindows is not allowed in side inputs");
}
@Override
public Instant getOutputTime(Instant inputTimestamp, PartitionedGlobalWindow window) {
return inputTimestamp;
}
}
/**
* Custom Window object that encodes a String value.
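   *
   * <p>Its maximum timestamp matches that of {@link GlobalWindow}, so each partition behaves
   * like a keyed copy of the global window.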
*/
private static class PartitionedGlobalWindow extends BoundedWindow {
String value;
public PartitionedGlobalWindow(String value) {
this.value = value;
}
@Override
public Instant maxTimestamp() {
return GlobalWindow.INSTANCE.maxTimestamp();
}
@Override
public boolean equals(Object other) {
if (other instanceof PartitionedGlobalWindow) {
return value.equals(((PartitionedGlobalWindow) other).value);
}
return false;
}
@Override
public int hashCode() {
return value.hashCode();
}
}
/**
   * Coder for {@link PartitionedGlobalWindow}.
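   *
   * <p>Encoding simply delegates to {@link StringUtf8Coder} for the wrapped string value.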
*/
private static class PartitionedGlobalWindowCoder extends AtomicCoder<PartitionedGlobalWindow> {
@Override
public void encode(PartitionedGlobalWindow window, OutputStream outStream)
throws IOException, CoderException {
encode(window, outStream, Context.NESTED);
}
@Override
public void encode(PartitionedGlobalWindow window, OutputStream outStream, Context context)
throws IOException, CoderException {
StringUtf8Coder.of().encode(window.value, outStream, context);
}
@Override
public PartitionedGlobalWindow decode(InputStream inStream) throws IOException, CoderException {
return decode(inStream, Context.NESTED);
}
@Override
public PartitionedGlobalWindow decode(InputStream inStream, Context context)
throws IOException, CoderException {
return new PartitionedGlobalWindow(StringUtf8Coder.of().decode(inStream, context));
}
@Override
    public void verifyDeterministic() {
      // Deterministic because encoding delegates to the deterministic StringUtf8Coder.
    }
}
@Test
public void testStreamingWriteWithDynamicTables() throws Exception {
testWriteWithDynamicTables(true);
}
@Test
public void testBatchWriteWithDynamicTables() throws Exception {
testWriteWithDynamicTables(false);
}
  private void testWriteWithDynamicTables(boolean streaming) throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
datasetService.createDataset("project-id", "dataset-id", "", "");
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withDatasetService(datasetService)
.withJobService(new FakeJobService());
List<Integer> inserts = new ArrayList<>();
for (int i = 0; i < 10; i++) {
inserts.add(i);
}
    // Create a windowing strategy that puts the input into five different windows depending on
    // the record value.
WindowFn<Integer, PartitionedGlobalWindow> windowFn = new PartitionedGlobalWindows(
new SerializableFunction<Integer, String>() {
@Override
public String apply(Integer i) {
return Integer.toString(i % 5);
}
}
);
final Map<Integer, TableDestination> targetTables = Maps.newHashMap();
Map<String, String> schemas = Maps.newHashMap();
for (int i = 0; i < 5; i++) {
TableDestination destination = new TableDestination("project-id:dataset-id"
+ ".table-id-" + i, "");
targetTables.put(i, destination);
      // Make sure each target table has its own custom schema.
schemas.put(destination.getTableSpec(),
BigQueryHelpers.toJsonString(new TableSchema().setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER"),
new TableFieldSchema().setName("custom_" + i).setType("STRING")))));
}
SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction =
new SerializableFunction<ValueInSingleWindow<Integer>, TableDestination>() {
@Override
public TableDestination apply(ValueInSingleWindow<Integer> input) {
PartitionedGlobalWindow window = (PartitionedGlobalWindow) input.getWindow();
            // Check that the element is also accessible here and that it matches the window.
checkArgument(window.value.equals(Integer.toString(input.getValue() % 5)),
"Incorrect element");
return targetTables.get(input.getValue() % 5);
}
};
Pipeline p = TestPipeline.create(bqOptions);
PCollection<Integer> input = p.apply("CreateSource", Create.of(inserts));
if (streaming) {
input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
}
PCollectionView<Map<String, String>> schemasView =
p.apply("CreateSchemaMap", Create.of(schemas))
.apply("ViewSchemaAsMap", View.<String, String>asMap());
input.apply(Window.<Integer>into(windowFn))
.apply(BigQueryIO.<Integer>write()
.to(tableFunction)
.withFormatFunction(new SerializableFunction<Integer, TableRow>() {
@Override
public TableRow apply(Integer i) {
return new TableRow().set("name", "number" + i).set("number", i);
}})
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
.withSchemaFromView(schemasView)
.withTestServices(fakeBqServices)
.withoutValidation());
p.run();
for (int i = 0; i < 5; ++i) {
String tableId = String.format("table-id-%d", i);
String tableSpec = String.format("project-id:dataset-id.%s", tableId);
      // Verify that the table was created with the correct schema.
assertThat(BigQueryHelpers.toJsonString(
datasetService.getTable(new TableReference().setProjectId("project-id")
.setDatasetId("dataset-id").setTableId(tableId)).getSchema()),
equalTo(schemas.get(tableSpec)));
// Verify that the table has the expected contents.
assertThat(datasetService.getAllRows("project-id", "dataset-id", tableId),
containsInAnyOrder(
new TableRow().set("name", String.format("number%d", i)).set("number", i),
new TableRow().set("name", String.format("number%d", i + 5)).set("number", i + 5)));
}
}
@Test
public void testWriteUnknown() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
datasetService.createDataset("project-id", "dataset-id", "", "");
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
new TableRow().set("name", "a").set("number", 1),
new TableRow().set("name", "b").set("number", 2),
new TableRow().set("name", "c").set("number", 3))
.withCoder(TableRowJsonCoder.of()))
.apply(BigQueryIO.writeTableRows().to("project-id:dataset-id.table-id")
.withCreateDisposition(CreateDisposition.CREATE_NEVER)
.withTestServices(fakeBqServices)
.withoutValidation());
thrown.expect(RuntimeException.class);
thrown.expectMessage("Failed to create load job");
try {
p.run();
} finally {
File tempDir = new File(bqOptions.getTempLocation());
testNumFiles(tempDir, 0);
}
}
@Test
public void testWriteFailedJobs() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
new TableRow().set("name", "a").set("number", 1),
new TableRow().set("name", "b").set("number", 2),
new TableRow().set("name", "c").set("number", 3))
.withCoder(TableRowJsonCoder.of()))
.apply(BigQueryIO.writeTableRows().to("dataset-id.table-id")
.withCreateDisposition(CreateDisposition.CREATE_NEVER)
.withTestServices(fakeBqServices)
.withoutValidation());
thrown.expect(RuntimeException.class);
thrown.expectMessage("Failed to create load job with id prefix");
thrown.expectMessage("reached max retries");
thrown.expectMessage("last failed load job");
try {
p.run();
} finally {
File tempDir = new File(bqOptions.getTempLocation());
testNumFiles(tempDir, 0);
}
}
@Test
public void testBuildSourceDisplayDataTable() {
String tableSpec = "project:dataset.tableid";
BigQueryIO.Read read = BigQueryIO.read()
.from(tableSpec)
.withoutResultFlattening()
.usingStandardSql()
.withoutValidation();
DisplayData displayData = DisplayData.from(read);
assertThat(displayData, hasDisplayItem("table", tableSpec));
assertThat(displayData, hasDisplayItem("flattenResults", false));
assertThat(displayData, hasDisplayItem("useLegacySql", false));
assertThat(displayData, hasDisplayItem("validation", false));
}
@Test
public void testBuildSourceDisplayDataQuery() {
BigQueryIO.Read read = BigQueryIO.read()
.fromQuery("myQuery")
.withoutResultFlattening()
.usingStandardSql()
.withoutValidation();
DisplayData displayData = DisplayData.from(read);
assertThat(displayData, hasDisplayItem("query", "myQuery"));
assertThat(displayData, hasDisplayItem("flattenResults", false));
assertThat(displayData, hasDisplayItem("useLegacySql", false));
assertThat(displayData, hasDisplayItem("validation", false));
}
@Test
public void testTableSourcePrimitiveDisplayData() throws IOException, InterruptedException {
DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
BigQueryIO.Read read = BigQueryIO.read()
.from("project:dataset.tableId")
.withTestServices(new FakeBigQueryServices()
.withDatasetService(new FakeDatasetService())
.withJobService(new FakeJobService()))
.withoutValidation();
Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read);
assertThat("BigQueryIO.Read should include the table spec in its primitive display data",
displayData, hasItem(hasDisplayItem("table")));
}
@Test
public void testQuerySourcePrimitiveDisplayData() throws IOException, InterruptedException {
DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
BigQueryIO.Read read = BigQueryIO.read()
.fromQuery("foobar")
.withTestServices(new FakeBigQueryServices()
.withDatasetService(new FakeDatasetService())
.withJobService(new FakeJobService()))
.withoutValidation();
Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read);
assertThat("BigQueryIO.Read should include the query in its primitive display data",
displayData, hasItem(hasDisplayItem("query")));
}
@Test
public void testBuildWrite() {
BigQueryIO.Write<TableRow> write =
BigQueryIO.writeTableRows().to("foo.com:project:somedataset.sometable");
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
}
@Test
public void testBuildWriteWithoutValidation() {
// This test just checks that using withoutValidation will not trigger object
// construction errors.
    BigQueryIO.Write<TableRow> write =
BigQueryIO.<TableRow>write().to("foo.com:project:somedataset.sometable")
.withoutValidation();
checkWriteObjectWithValidate(
write,
"foo.com:project",
"somedataset",
"sometable",
null,
CreateDisposition.CREATE_IF_NEEDED,
WriteDisposition.WRITE_EMPTY,
null,
false);
}
@Test
public void testBuildWriteDefaultProject() {
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
.to("somedataset" + ".sometable");
checkWriteObject(
write, null, "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY,
null);
}
@Test
public void testBuildWriteWithTableReference() {
TableReference table = new TableReference()
.setProjectId("foo.com:project")
.setDatasetId("somedataset")
.setTableId("sometable");
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows().to(table);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
}
@Test
public void testBuildWriteWithSchema() {
TableSchema schema = new TableSchema();
BigQueryIO.Write<TableRow> write =
BigQueryIO.<TableRow>write().to("foo.com:project:somedataset.sometable").withSchema(schema);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
schema, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
}
@Test
public void testBuildWriteWithCreateDispositionNever() {
BigQueryIO.Write<TableRow> write = BigQueryIO.<TableRow>write()
.to("foo.com:project:somedataset.sometable")
.withCreateDisposition(CreateDisposition.CREATE_NEVER);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_NEVER, WriteDisposition.WRITE_EMPTY, null);
}
@Test
public void testBuildWriteWithCreateDispositionIfNeeded() {
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
.to("foo.com:project:somedataset.sometable")
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
}
@Test
public void testBuildWriteWithWriteDispositionTruncate() {
BigQueryIO.Write<TableRow> write = BigQueryIO.<TableRow>write()
.to("foo.com:project:somedataset.sometable")
.withWriteDisposition(WriteDisposition.WRITE_TRUNCATE);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_TRUNCATE, null);
}
@Test
public void testBuildWriteWithWriteDispositionAppend() {
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
.to("foo.com:project:somedataset.sometable")
.withWriteDisposition(WriteDisposition.WRITE_APPEND);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_APPEND, null);
}
@Test
public void testBuildWriteWithWriteDispositionEmpty() {
BigQueryIO.Write<TableRow> write = BigQueryIO.<TableRow>write()
.to("foo.com:project:somedataset.sometable")
.withWriteDisposition(WriteDisposition.WRITE_EMPTY);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
}
@Test
public void testBuildWriteWithWriteWithTableDescription() {
final String tblDescription = "foo bar table";
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
.to("foo.com:project:somedataset.sometable")
.withTableDescription(tblDescription);
checkWriteObject(
write,
"foo.com:project",
"somedataset",
"sometable",
null,
CreateDisposition.CREATE_IF_NEEDED,
WriteDisposition.WRITE_EMPTY,
tblDescription);
}
@Test
public void testBuildWriteDisplayData() {
String tableSpec = "project:dataset.table";
TableSchema schema = new TableSchema().set("col1", "type1").set("col2", "type2");
final String tblDescription = "foo bar table";
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
.to(tableSpec)
.withSchema(schema)
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
.withWriteDisposition(WriteDisposition.WRITE_APPEND)
.withTableDescription(tblDescription)
.withoutValidation();
DisplayData displayData = DisplayData.from(write);
assertThat(displayData, hasDisplayItem("table"));
assertThat(displayData, hasDisplayItem("schema"));
assertThat(displayData,
hasDisplayItem("createDisposition", CreateDisposition.CREATE_IF_NEEDED.toString()));
assertThat(displayData,
hasDisplayItem("writeDisposition", WriteDisposition.WRITE_APPEND.toString()));
assertThat(displayData,
hasDisplayItem("tableDescription", tblDescription));
assertThat(displayData, hasDisplayItem("validation", false));
}
private void testWriteValidatesDataset(boolean unbounded) throws Exception {
String projectId = "someproject";
String datasetId = "somedataset";
BigQueryOptions options = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
options.setProject(projectId);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(new FakeDatasetService());
Pipeline p = TestPipeline.create(options);
TableReference tableRef = new TableReference();
tableRef.setDatasetId(datasetId);
tableRef.setTableId("sometable");
PCollection<TableRow> tableRows;
if (unbounded) {
tableRows =
p.apply(GenerateSequence.from(0))
.apply(
MapElements.via(
new SimpleFunction<Long, TableRow>() {
@Override
public TableRow apply(Long input) {
return null;
}
}))
.setCoder(TableRowJsonCoder.of());
} else {
tableRows = p
.apply(Create.empty(TableRowJsonCoder.of()));
}
thrown.expect(RuntimeException.class);
    // The message will be one of the following, depending on the execution environment.
thrown.expectMessage(
Matchers.either(Matchers.containsString("Unable to confirm BigQuery dataset presence"))
.or(Matchers.containsString("BigQuery dataset not found for table")));
tableRows
.apply(
BigQueryIO.writeTableRows().to(tableRef)
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
.withSchema(new TableSchema())
.withTestServices(fakeBqServices));
p.run();
}
@Test
public void testWriteValidatesDatasetBatch() throws Exception {
testWriteValidatesDataset(false);
}
@Test
public void testWriteValidatesDatasetStreaming() throws Exception {
testWriteValidatesDataset(true);
}
@Test
public void testCreateNeverWithStreaming() throws Exception {
BigQueryOptions options = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
options.setProject("project");
options.setStreaming(true);
Pipeline p = TestPipeline.create(options);
TableReference tableRef = new TableReference();
tableRef.setDatasetId("dataset");
tableRef.setTableId("sometable");
PCollection<TableRow> tableRows =
p.apply(GenerateSequence.from(0))
.apply(
MapElements.via(
new SimpleFunction<Long, TableRow>() {
@Override
public TableRow apply(Long input) {
return null;
}
}))
.setCoder(TableRowJsonCoder.of());
tableRows
.apply(BigQueryIO.writeTableRows().to(tableRef)
.withCreateDisposition(CreateDisposition.CREATE_NEVER)
.withoutValidation());
}
@Test
public void testTableParsing() {
TableReference ref = BigQueryHelpers
.parseTableSpec("my-project:data_set.table_name");
Assert.assertEquals("my-project", ref.getProjectId());
Assert.assertEquals("data_set", ref.getDatasetId());
Assert.assertEquals("table_name", ref.getTableId());
}
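  // Table specs take the form "project:dataset.table". The project may be omitted at parse
  // time and is filled in later from the pipeline's default project
  // (see testValidateReadSetsDefaultProject above).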
@Test
public void testTableParsing_validPatterns() {
BigQueryHelpers.parseTableSpec("a123-456:foo_bar.d");
BigQueryHelpers.parseTableSpec("a12345:b.c");
BigQueryHelpers.parseTableSpec("b12345.c");
}
@Test
public void testTableParsing_noProjectId() {
TableReference ref = BigQueryHelpers
.parseTableSpec("data_set.table_name");
    Assert.assertNull(ref.getProjectId());
Assert.assertEquals("data_set", ref.getDatasetId());
Assert.assertEquals("table_name", ref.getTableId());
}
@Test
public void testTableParsingError() {
thrown.expect(IllegalArgumentException.class);
BigQueryHelpers.parseTableSpec("0123456:foo.bar");
}
@Test
public void testTableParsingError_2() {
thrown.expect(IllegalArgumentException.class);
BigQueryHelpers.parseTableSpec("myproject:.bar");
}
@Test
public void testTableParsingError_3() {
thrown.expect(IllegalArgumentException.class);
BigQueryHelpers.parseTableSpec(":a.b");
}
@Test
public void testTableParsingError_slash() {
thrown.expect(IllegalArgumentException.class);
BigQueryHelpers.parseTableSpec("a\\b12345:c.d");
}
// Test that BigQuery's special null placeholder objects can be encoded.
@Test
public void testCoder_nullCell() throws CoderException {
TableRow row = new TableRow();
row.set("temperature", Data.nullOf(Object.class));
row.set("max_temperature", Data.nullOf(Object.class));
byte[] bytes = CoderUtils.encodeToByteArray(TableRowJsonCoder.of(), row);
TableRow newRow = CoderUtils.decodeFromByteArray(TableRowJsonCoder.of(), bytes);
byte[] newBytes = CoderUtils.encodeToByteArray(TableRowJsonCoder.of(), newRow);
Assert.assertArrayEquals(bytes, newBytes);
}
@Test
public void testBigQueryIOGetName() {
assertEquals("BigQueryIO.Read",
BigQueryIO.read().from("somedataset.sometable").getName());
assertEquals("BigQueryIO.Write",
BigQueryIO.<TableRow>write().to("somedataset.sometable").getName());
}
@Test
public void testWriteValidateFailsCreateNoSchema() {
p.enableAbandonedNodeEnforcement(false);
thrown.expect(IllegalArgumentException.class);
thrown.expectMessage("no schema was provided");
p
.apply(Create.empty(TableRowJsonCoder.of()))
.apply(BigQueryIO.writeTableRows()
.to("dataset.table")
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED));
}
@Test
public void testBigQueryTableSourceThroughJsonAPI() throws Exception {
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
List<TableRow> expected = ImmutableList.of(
new TableRow().set("name", "a").set("number", "1"),
new TableRow().set("name", "b").set("number", "2"),
new TableRow().set("name", "c").set("number", "3"),
new TableRow().set("name", "d").set("number", "4"),
new TableRow().set("name", "e").set("number", "5"),
new TableRow().set("name", "f").set("number", "6"));
TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
datasetService.createDataset(table.getProjectId(), table.getDatasetId(), "", "");
datasetService.createTable(new Table().setTableReference(table));
datasetService.insertAll(table, expected, null);
Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceThroughJsonAPI");
String stepUuid = "testStepUuid";
BoundedSource<TableRow> bqSource = BigQueryTableSource.create(
stepUuid, StaticValueProvider.of(table), fakeBqServices);
PipelineOptions options = PipelineOptionsFactory.create();
options.setTempLocation(baseDir.toString());
Assert.assertThat(
SourceTestUtils.readFromSource(bqSource, options),
CoreMatchers.is(expected));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
}
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
FakeDatasetService fakeDatasetService = new FakeDatasetService();
FakeJobService fakeJobService = new FakeJobService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(fakeJobService)
.withDatasetService(fakeDatasetService);
List<TableRow> expected = ImmutableList.of(
new TableRow().set("name", "a").set("number", 1L),
new TableRow().set("name", "b").set("number", 2L),
new TableRow().set("name", "c").set("number", 3L),
new TableRow().set("name", "d").set("number", 4L),
new TableRow().set("name", "e").set("number", 5L),
new TableRow().set("name", "f").set("number", 6L));
TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
fakeDatasetService.createDataset("project", "data_set", "", "");
fakeDatasetService.createTable(new Table().setTableReference(table)
.setSchema(new TableSchema()
.setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER")))));
fakeDatasetService.insertAll(table, expected, null);
Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceInitSplit");
String stepUuid = "testStepUuid";
BoundedSource<TableRow> bqSource = BigQueryTableSource.create(
stepUuid, StaticValueProvider.of(table), fakeBqServices);
PipelineOptions options = PipelineOptionsFactory.create();
options.setTempLocation(baseDir.toString());
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setProject("project");
List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
    // Simulate a repeated call to split(), as a Dataflow worker sometimes does.
sources = bqSource.split(200, options);
assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
// A repeated call to split() should not have caused a duplicate extract job.
assertEquals(1, fakeJobService.getNumExtractJobCalls());
}
@Test
public void testBigQueryQuerySourceInitSplit() throws Exception {
TableReference dryRunTable = new TableReference();
Job queryJob = new Job();
JobStatistics queryJobStats = new JobStatistics();
JobStatistics2 queryStats = new JobStatistics2();
queryStats.setReferencedTables(ImmutableList.of(dryRunTable));
queryJobStats.setQuery(queryStats);
queryJob.setStatus(new JobStatus())
.setStatistics(queryJobStats);
Job extractJob = new Job();
JobStatistics extractJobStats = new JobStatistics();
JobStatistics4 extractStats = new JobStatistics4();
extractStats.setDestinationUriFileCounts(ImmutableList.of(1L));
extractJobStats.setExtract(extractStats);
extractJob.setStatus(new JobStatus())
.setStatistics(extractJobStats);
FakeJobService fakeJobService = new FakeJobService();
FakeDatasetService fakeDatasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(fakeJobService)
.withDatasetService(fakeDatasetService);
List<TableRow> expected = ImmutableList.of(
new TableRow().set("name", "a").set("number", 1L),
new TableRow().set("name", "b").set("number", 2L),
new TableRow().set("name", "c").set("number", 3L),
new TableRow().set("name", "d").set("number", 4L),
new TableRow().set("name", "e").set("number", 5L),
new TableRow().set("name", "f").set("number", 6L));
PipelineOptions options = PipelineOptionsFactory.create();
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setProject("project");
String stepUuid = "testStepUuid";
TableReference tempTableReference = createTempTableReference(
bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
fakeDatasetService.createDataset(
bqOptions.getProject(), tempTableReference.getDatasetId(), "", "");
fakeDatasetService.createTable(new Table()
.setTableReference(tempTableReference)
.setSchema(new TableSchema()
.setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER")))));
Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryQuerySourceInitSplit");
String query = FakeBigQueryServices.encodeQuery(expected);
BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
stepUuid, StaticValueProvider.of(query),
true /* flattenResults */, true /* useLegacySql */, fakeBqServices);
options.setTempLocation(baseDir.toString());
TableReference queryTable = new TableReference()
.setProjectId(bqOptions.getProject())
.setDatasetId(tempTableReference.getDatasetId())
.setTableId(tempTableReference.getTableId());
fakeJobService.expectDryRunQuery(bqOptions.getProject(), query,
new JobStatistics().setQuery(
new JobStatistics2()
.setTotalBytesProcessed(100L)
.setReferencedTables(ImmutableList.of(queryTable))));
List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
@Test
public void testBigQueryNoTableQuerySourceInitSplit() throws Exception {
TableReference dryRunTable = new TableReference();
Job queryJob = new Job();
JobStatistics queryJobStats = new JobStatistics();
JobStatistics2 queryStats = new JobStatistics2();
queryStats.setReferencedTables(ImmutableList.of(dryRunTable));
queryJobStats.setQuery(queryStats);
queryJob.setStatus(new JobStatus())
.setStatistics(queryJobStats);
Job extractJob = new Job();
JobStatistics extractJobStats = new JobStatistics();
JobStatistics4 extractStats = new JobStatistics4();
extractStats.setDestinationUriFileCounts(ImmutableList.of(1L));
extractJobStats.setExtract(extractStats);
extractJob.setStatus(new JobStatus())
.setStatistics(extractJobStats);
FakeDatasetService datasetService = new FakeDatasetService();
FakeJobService jobService = new FakeJobService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(jobService)
.withDatasetService(datasetService);
PipelineOptions options = PipelineOptionsFactory.create();
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setProject("project");
String stepUuid = "testStepUuid";
TableReference tempTableReference = createTempTableReference(
bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
List<TableRow> expected = ImmutableList.of(
new TableRow().set("name", "a").set("number", 1L),
new TableRow().set("name", "b").set("number", 2L),
new TableRow().set("name", "c").set("number", 3L),
new TableRow().set("name", "d").set("number", 4L),
new TableRow().set("name", "e").set("number", 5L),
new TableRow().set("name", "f").set("number", 6L));
datasetService.createDataset(
tempTableReference.getProjectId(), tempTableReference.getDatasetId(), "", "");
Table table = new Table()
.setTableReference(tempTableReference)
.setSchema(new TableSchema()
.setFields(
ImmutableList.of(
new TableFieldSchema().setName("name").setType("STRING"),
new TableFieldSchema().setName("number").setType("INTEGER"))));
datasetService.createTable(table);
String query = FakeBigQueryServices.encodeQuery(expected);
jobService.expectDryRunQuery("project", query,
new JobStatistics().setQuery(
new JobStatistics2()
.setTotalBytesProcessed(100L)
.setReferencedTables(ImmutableList.of(table.getTableReference()))));
Path baseDir = Files.createTempDirectory(
tempFolder, "testBigQueryNoTableQuerySourceInitSplit");
BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
stepUuid,
StaticValueProvider.of(query),
true /* flattenResults */, true /* useLegacySql */, fakeBqServices);
options.setTempLocation(baseDir.toString());
List<TableRow> read = convertBigDecimalsToLong(
SourceTestUtils.readFromSource(bqSource, options));
assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
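/**
 * Tests that {@link TransformingSource} applies its transform to every element while
 * delegating reading and splitting to the wrapped source.
 */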
@Test
public void testTransformingSource() throws Exception {
int numElements = 10000;
@SuppressWarnings("deprecation")
BoundedSource<Long> longSource = CountingSource.upTo(numElements);
SerializableFunction<Long, String> toStringFn =
new SerializableFunction<Long, String>() {
@Override
public String apply(Long input) {
return input.toString();
}};
BoundedSource<String> stringSource = new TransformingSource<>(
longSource, toStringFn, StringUtf8Coder.of());
List<String> expected = Lists.newArrayList();
for (int i = 0; i < numElements; i++) {
expected.add(String.valueOf(i));
}
PipelineOptions options = PipelineOptionsFactory.create();
assertThat(
SourceTestUtils.readFromSource(stringSource, options),
CoreMatchers.is(expected));
SourceTestUtils.assertSplitAtFractionBehavior(
stringSource, 100, 0.3, ExpectedSplitOutcome.MUST_SUCCEED_AND_BE_CONSISTENT, options);
SourceTestUtils.assertSourcesEqualReferenceSource(
stringSource, stringSource.split(100, options), options);
}
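/**
 * Tests that {@link TransformingSource} remains consistent when the wrapped source cannot be
 * split.
 */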
@Test
public void testTransformingSourceUnsplittable() throws Exception {
int numElements = 10000;
@SuppressWarnings("deprecation")
BoundedSource<Long> longSource =
SourceTestUtils.toUnsplittableSource(CountingSource.upTo(numElements));
SerializableFunction<Long, String> toStringFn =
new SerializableFunction<Long, String>() {
@Override
public String apply(Long input) {
return input.toString();
}
};
BoundedSource<String> stringSource =
new TransformingSource<>(longSource, toStringFn, StringUtf8Coder.of());
List<String> expected = Lists.newArrayList();
for (int i = 0; i < numElements; i++) {
expected.add(String.valueOf(i));
}
PipelineOptions options = PipelineOptionsFactory.create();
assertThat(
SourceTestUtils.readFromSource(stringSource, options), CoreMatchers.is(expected));
SourceTestUtils.assertSplitAtFractionBehavior(
stringSource, 100, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
SourceTestUtils.assertSourcesEqualReferenceSource(
stringSource, stringSource.split(100, options), options);
}
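/** Tests that {@link PassThroughThenCleanup} passes its input through unchanged. */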
@Test
public void testPassThroughThenCleanup() throws Exception {
PCollection<Integer> output = p
.apply(Create.of(1, 2, 3))
.apply(new PassThroughThenCleanup<Integer>(new CleanupOperation() {
@Override
void cleanup(PipelineOptions options) throws Exception {
// no-op
}}));
PAssert.that(output).containsInAnyOrder(1, 2, 3);
p.run();
}
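/** Tests that the {@link CleanupOperation} is executed even when the input is empty. */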
@Test
public void testPassThroughThenCleanupExecuted() throws Exception {
p.apply(Create.empty(VarIntCoder.of()))
.apply(new PassThroughThenCleanup<Integer>(new CleanupOperation() {
@Override
void cleanup(PipelineOptions options) throws Exception {
throw new RuntimeException("cleanup executed");
}}));
thrown.expect(RuntimeException.class);
thrown.expectMessage("cleanup executed");
p.run();
}
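/** Tests {@link WritePartition} when there is no input data. */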
@Test
public void testWritePartitionEmptyData() throws Exception {
long numFiles = 0;
long fileSize = 0;
// When there is no input data, a single empty file is created, so one partition is needed.
long expectedNumPartitions = 1;
testWritePartition(1, numFiles, fileSize, expectedNumPartitions);
}
@Test
public void testWritePartitionSinglePartition() throws Exception {
long numFiles = BatchLoads.MAX_NUM_FILES;
long fileSize = 1;
// One partition is needed.
long expectedNumPartitions = 1;
testWritePartition(2, numFiles, fileSize, expectedNumPartitions);
}
@Test
public void testWritePartitionManyFiles() throws Exception {
long numFiles = BatchLoads.MAX_NUM_FILES * 3;
long fileSize = 1;
// One partition is needed for each group of BatchLoads.MAX_NUM_FILES files.
long expectedNumPartitions = 3;
testWritePartition(2, numFiles, fileSize, expectedNumPartitions);
}
@Test
public void testWritePartitionLargeFileSize() throws Exception {
long numFiles = 10;
long fileSize = BatchLoads.MAX_SIZE_BYTES / 3;
// One partition is needed for each group of three files.
long expectedNumPartitions = 4;
testWritePartition(2, numFiles, fileSize, expectedNumPartitions);
}
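/**
 * Runs {@link WritePartition} over {@code numFilesPerTable} files of {@code fileSize} bytes
 * for each of {@code numTables} tables, and asserts that each table is split into
 * {@code expectedNumPartitionsPerTable} partitions.
 */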
private void testWritePartition(long numTables, long numFilesPerTable, long fileSize,
long expectedNumPartitionsPerTable)
throws Exception {
p.enableAbandonedNodeEnforcement(false);
// When a static destination is specified (i.e. not through a dynamic table function) and
// there is no input data, WritePartition generates an empty table. This test covers that
// path.
boolean isSingleton = numTables == 1 && numFilesPerTable == 0;
List<ShardedKey<String>> expectedPartitions = Lists.newArrayList();
if (isSingleton) {
expectedPartitions.add(ShardedKey.<String>of(null, 1));
} else {
for (int i = 0; i < numTables; ++i) {
for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
String tableName = String.format("project-id:dataset-id.tables%05d", i);
expectedPartitions.add(ShardedKey.of(tableName, j));
}
}
}
List<WriteBundlesToFiles.Result<String>> files = Lists.newArrayList();
Map<String, List<String>> filenamesPerTable = Maps.newHashMap();
for (int i = 0; i < numTables; ++i) {
String tableName = String.format("project-id:dataset-id.tables%05d", i);
List<String> filenames = filenamesPerTable.get(tableName);
if (filenames == null) {
filenames = Lists.newArrayList();
filenamesPerTable.put(tableName, filenames);
}
for (int j = 0; j < numFilesPerTable; ++j) {
String fileName = String.format("%s_files%05d", tableName, j);
filenames.add(fileName);
files.add(new Result<>(fileName, fileSize, tableName));
}
}
TupleTag<KV<ShardedKey<String>, List<String>>> multiPartitionsTag =
new TupleTag<KV<ShardedKey<String>, List<String>>>("multiPartitionsTag") {};
TupleTag<KV<ShardedKey<String>, List<String>>> singlePartitionTag =
new TupleTag<KV<ShardedKey<String>, List<String>>>("singlePartitionTag") {};
PCollectionView<Iterable<WriteBundlesToFiles.Result<String>>> resultsView =
p.apply(
Create.of(files)
.withCoder(WriteBundlesToFiles.ResultCoder.of(StringUtf8Coder.of())))
.apply(View.<WriteBundlesToFiles.Result<String>>asIterable());
String tempFilePrefix = testFolder.newFolder("BigQueryIOTest").getAbsolutePath();
PCollectionView<String> tempFilePrefixView =
p.apply(Create.of(tempFilePrefix)).apply(View.<String>asSingleton());
WritePartition<String> writePartition =
new WritePartition<>(
isSingleton, tempFilePrefixView, resultsView, multiPartitionsTag, singlePartitionTag);
DoFnTester<Void, KV<ShardedKey<String>, List<String>>> tester =
DoFnTester.of(writePartition);
tester.setSideInput(resultsView, GlobalWindow.INSTANCE, files);
tester.setSideInput(tempFilePrefixView, GlobalWindow.INSTANCE, tempFilePrefix);
tester.processElement(null);
List<KV<ShardedKey<String>, List<String>>> partitions;
if (expectedNumPartitionsPerTable > 1) {
partitions = tester.takeOutputElements(multiPartitionsTag);
} else {
partitions = tester.takeOutputElements(singlePartitionTag);
}
List<ShardedKey<String>> partitionsResult = Lists.newArrayList();
Map<String, List<String>> filesPerTableResult = Maps.newHashMap();
for (KV<ShardedKey<String>, List<String>> partition : partitions) {
String table = partition.getKey().getKey();
partitionsResult.add(partition.getKey());
List<String> tableFilesResult = filesPerTableResult.get(table);
if (tableFilesResult == null) {
tableFilesResult = Lists.newArrayList();
filesPerTableResult.put(table, tableFilesResult);
}
tableFilesResult.addAll(partition.getValue());
}
assertThat(partitionsResult,
containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
if (isSingleton) {
assertEquals(1, filesPerTableResult.size());
List<String> singletonFiles = filesPerTableResult.values().iterator().next();
assertTrue(Files.exists(Paths.get(singletonFiles.get(0))));
assertThat(Files.readAllBytes(Paths.get(singletonFiles.get(0))).length,
Matchers.equalTo(0));
} else {
assertEquals(filenamesPerTable, filesPerTableResult);
}
}
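/**
 * A {@link DynamicDestinations} implementation that maps each destination string directly to
 * a table of the same name.
 */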
static class IdentityDynamicTables extends DynamicDestinations<String, String> {
@Override
public String getDestination(ValueInSingleWindow<String> element) {
throw new UnsupportedOperationException("getDestination not expected in this test.");
}
@Override
public TableDestination getTable(String destination) {
return new TableDestination(destination, destination);
}
@Override
public TableSchema getSchema(String destination) {
throw new UnsupportedOperationException("getSchema not expected in this test.");
}
}
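/**
 * Tests that {@link WriteTables} loads each partition into its own temporary table and emits
 * the expected mapping from final destinations to temporary tables.
 */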
@Test
public void testWriteTables() throws Exception {
p.enableAbandonedNodeEnforcement(false);
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
datasetService.createDataset("project-id", "dataset-id", "", "");
long numTables = 3;
long numPartitions = 3;
long numFilesPerPartition = 10;
String jobIdToken = "jobIdToken";
String stepUuid = "stepUuid";
Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");
List<KV<ShardedKey<String>, List<String>>> partitions = Lists.newArrayList();
for (int i = 0; i < numTables; ++i) {
String tableName = String.format("project-id:dataset-id.table%05d", i);
TableDestination tableDestination = new TableDestination(tableName, tableName);
for (int j = 0; j < numPartitions; ++j) {
String tempTableId = BigQueryHelpers.createJobId(jobIdToken, tableDestination, j);
List<String> filesPerPartition = Lists.newArrayList();
for (int k = 0; k < numFilesPerPartition; ++k) {
String filename = Paths.get(baseDir.toString(),
String.format("files0x%08x_%05d", tempTableId.hashCode(), k)).toString();
ResourceId fileResource =
FileSystems.matchNewResource(filename, false /* isDirectory */);
try (WritableByteChannel channel = FileSystems.create(fileResource, MimeTypes.TEXT)) {
try (OutputStream output = Channels.newOutputStream(channel)) {
TableRow tableRow = new TableRow().set("name", tableName);
TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
output.write("\n".getBytes(StandardCharsets.UTF_8));
}
}
filesPerPartition.add(filename);
}
partitions.add(KV.of(ShardedKey.of(tableDestination.getTableSpec(), j),
filesPerPartition));
List<String> expectedTables = expectedTempTables.get(tableDestination);
if (expectedTables == null) {
expectedTables = Lists.newArrayList();
expectedTempTables.put(tableDestination, expectedTables);
}
String json = String.format(
"{\"datasetId\":\"dataset-id\",\"projectId\":\"project-id\",\"tableId\":\"%s\"}",
tempTableId);
expectedTables.add(json);
}
}
PCollectionView<String> jobIdTokenView = p
.apply("CreateJobId", Create.of("jobId"))
.apply(View.<String>asSingleton());
PCollectionView<Map<String, String>> schemaMapView =
p.apply("CreateEmptySchema",
Create.empty(new TypeDescriptor<KV<String, String>>() {}))
.apply(View.<String, String>asMap());
WriteTables<String> writeTables =
new WriteTables<>(
false,
fakeBqServices,
jobIdTokenView,
schemaMapView,
WriteDisposition.WRITE_EMPTY,
CreateDisposition.CREATE_IF_NEEDED,
new IdentityDynamicTables());
DoFnTester<KV<ShardedKey<String>, List<String>>,
KV<TableDestination, String>> tester = DoFnTester.of(writeTables);
tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
tester.setSideInput(schemaMapView, GlobalWindow.INSTANCE, ImmutableMap.<String, String>of());
tester.getPipelineOptions().setTempLocation("tempLocation");
for (KV<ShardedKey<String>, List<String>> partition : partitions) {
tester.processElement(partition);
}
Map<TableDestination, List<String>> tempTablesResult = Maps.newHashMap();
for (KV<TableDestination, String> element : tester.takeOutputElements()) {
List<String> tables = tempTablesResult.get(element.getKey());
if (tables == null) {
tables = Lists.newArrayList();
tempTablesResult.put(element.getKey(), tables);
}
tables.add(element.getValue());
}
assertEquals(expectedTempTables, tempTablesResult);
}
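/**
 * Tests that {@link WriteTables#removeTemporaryFiles} deletes the listed files, tolerating
 * names that do not exist.
 */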
@Test
public void testRemoveTemporaryFiles() throws Exception {
BigQueryOptions bqOptions = PipelineOptionsFactory.as(BigQueryOptions.class);
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
int numFiles = 10;
List<String> fileNames = Lists.newArrayList();
String tempFilePrefix = bqOptions.getTempLocation() + "/";
for (int i = 0; i < numFiles; ++i) {
TableRowWriter writer = new TableRowWriter(tempFilePrefix);
writer.close();
fileNames.add(writer.getResult().resourceId.toString());
}
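// Add a file name that was never created; removal should tolerate missing files.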
fileNames.add(tempFilePrefix + String.format("files%05d", numFiles));
File tempDir = new File(bqOptions.getTempLocation());
testNumFiles(tempDir, 10);
WriteTables.removeTemporaryFiles(fileNames);
testNumFiles(tempDir, 0);
}
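/**
 * Tests that {@link WriteRename} copies each temporary table to its final destination and
 * deletes the temporary tables afterwards.
 */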
@Test
public void testWriteRename() throws Exception {
p.enableAbandonedNodeEnforcement(false);
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(datasetService);
datasetService.createDataset("project-id", "dataset-id", "", "");
final int numFinalTables = 3;
final int numTempTablesPerFinalTable = 3;
final int numRecordsPerTempTable = 10;
Map<TableDestination, List<TableRow>> expectedRowsPerTable = Maps.newHashMap();
String jobIdToken = "jobIdToken";
Map<TableDestination, Iterable<String>> tempTables = Maps.newHashMap();
for (int i = 0; i < numFinalTables; ++i) {
String tableName = "project-id:dataset-id.table_" + i;
TableDestination tableDestination = new TableDestination(
tableName, "table_" + i + "_desc");
List<String> tables = Lists.newArrayList();
tempTables.put(tableDestination, tables);
List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
if (expectedRows == null) {
expectedRows = Lists.newArrayList();
expectedRowsPerTable.put(tableDestination, expectedRows);
}
for (int j = 0; j < numTempTablesPerFinalTable; ++j) {
TableReference tempTable = new TableReference()
.setProjectId("project-id")
.setDatasetId("dataset-id")
.setTableId(String.format("%s_%05d_%05d", jobIdToken, i, j));
datasetService.createTable(new Table().setTableReference(tempTable));
List<TableRow> rows = Lists.newArrayList();
for (int k = 0; k < numRecordsPerTempTable; ++k) {
rows.add(new TableRow().set("number", j * numTempTablesPerFinalTable + k));
}
datasetService.insertAll(tempTable, rows, null);
expectedRows.addAll(rows);
tables.add(BigQueryHelpers.toJsonString(tempTable));
}
}
PCollection<KV<TableDestination, String>> tempTablesPCollection =
p.apply(Create.of(tempTables)
.withCoder(KvCoder.of(TableDestinationCoder.of(),
IterableCoder.of(StringUtf8Coder.of()))))
.apply(ParDo.of(new DoFn<KV<TableDestination, Iterable<String>>,
KV<TableDestination, String>>() {
@ProcessElement
public void processElement(ProcessContext c) {
TableDestination tableDestination = c.element().getKey();
for (String tempTable : c.element().getValue()) {
c.output(KV.of(tableDestination, tempTable));
}
}
}));
PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
PCollectionViews.multimapView(
tempTablesPCollection,
WindowingStrategy.globalDefault(),
KvCoder.of(TableDestinationCoder.of(),
StringUtf8Coder.of()));
PCollectionView<String> jobIdTokenView = p
.apply("CreateJobId", Create.of("jobId"))
.apply(View.<String>asSingleton());
WriteRename writeRename = new WriteRename(
fakeBqServices,
jobIdTokenView,
WriteDisposition.WRITE_EMPTY,
CreateDisposition.CREATE_IF_NEEDED,
tempTablesView);
DoFnTester<Void, Void> tester = DoFnTester.of(writeRename);
tester.setSideInput(tempTablesView, GlobalWindow.INSTANCE, tempTables);
tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
tester.processElement(null);
for (Map.Entry<TableDestination, Iterable<String>> entry : tempTables.entrySet()) {
TableDestination tableDestination = entry.getKey();
TableReference tableReference = tableDestination.getTableReference();
Table table = checkNotNull(datasetService.getTable(tableReference));
assertEquals(tableReference.getTableId() + "_desc", tableDestination.getTableDescription());
List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
assertThat(datasetService.getAllRows(tableReference.getProjectId(),
tableReference.getDatasetId(), tableReference.getTableId()),
containsInAnyOrder(Iterables.toArray(expectedRows, TableRow.class)));
// Temp tables should be deleted.
for (String tempTableJson : entry.getValue()) {
TableReference tempTable = BigQueryHelpers.fromJsonString(
tempTableJson, TableReference.class);
assertNull(datasetService.getTable(tempTable));
}
}
}
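/**
 * Tests that {@link WriteRename#removeTemporaryTables} deletes the given tables, tolerating
 * references to tables that do not exist.
 */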
@Test
public void testRemoveTemporaryTables() throws Exception {
FakeDatasetService datasetService = new FakeDatasetService();
String projectId = "project";
String datasetId = "dataset";
datasetService.createDataset(projectId, datasetId, "", "");
List<TableReference> tableRefs = Lists.newArrayList(
BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table1")),
BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table2")),
BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table3")));
for (TableReference tableRef : tableRefs) {
datasetService.createTable(new Table().setTableReference(tableRef));
}
// Add one more table to delete that does not actually exist.
tableRefs.add(
BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table4")));
WriteRename.removeTemporaryTables(datasetService, tableRefs);
for (TableReference ref : tableRefs) {
loggedWriteRename.verifyDebug("Deleting table " + toJsonString(ref));
checkState(datasetService.getTable(ref) == null,
"Table " + ref + " was not deleted!");
}
}
/** Test options. */
public interface RuntimeTestOptions extends PipelineOptions {
ValueProvider<String> getInputTable();
void setInputTable(ValueProvider<String> value);
ValueProvider<String> getInputQuery();
void setInputQuery(ValueProvider<String> value);
ValueProvider<String> getOutputTable();
void setOutputTable(ValueProvider<String> value);
ValueProvider<String> getOutputSchema();
void setOutputSchema(ValueProvider<String> value);
}
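/** Tests that a runtime-provided input table is not accessed at pipeline construction time. */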
@Test
public void testRuntimeOptionsNotCalledInApplyInputTable() {
RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline pipeline = TestPipeline.create(options);
BigQueryIO.Read read = BigQueryIO.read().from(
options.getInputTable()).withoutValidation();
pipeline.apply(read);
// Test that this doesn't throw.
DisplayData.from(read);
}
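/** Tests that a runtime-provided query is not accessed at pipeline construction time. */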
@Test
public void testRuntimeOptionsNotCalledInApplyInputQuery() {
RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline pipeline = TestPipeline.create(options);
BigQueryIO.Read read = BigQueryIO.read().fromQuery(
options.getInputQuery()).withoutValidation();
pipeline.apply(read);
// Test that this doesn't throw.
DisplayData.from(read);
}
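/** Tests that runtime-provided output table and schema are not accessed at construction time. */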
@Test
public void testRuntimeOptionsNotCalledInApplyOutput() {
RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline pipeline = TestPipeline.create(options);
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
.to(options.getOutputTable())
.withSchema(NestedValueProvider.of(
options.getOutputSchema(), new JsonSchemaToTableSchema()))
.withoutValidation();
pipeline
.apply(Create.empty(TableRowJsonCoder.of()))
.apply(write);
// Test that this doesn't throw.
DisplayData.from(write);
}
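/** Asserts that {@code tempDir} contains exactly {@code expectedNumFiles} regular files. */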
private static void testNumFiles(File tempDir, int expectedNumFiles) {
assertEquals(expectedNumFiles, tempDir.listFiles(new FileFilter() {
@Override
public boolean accept(File pathname) {
return pathname.isFile();
}}).length);
}
@Test
public void testShardedKeyCoderIsSerializableWithWellKnownCoderType() {
CoderProperties.coderSerializable(ShardedKeyCoder.of(GlobalWindow.Coder.INSTANCE));
}
@Test
public void testTableRowInfoCoderSerializable() {
CoderProperties.coderSerializable(TableRowInfoCoder.of());
}
@Test
public void testComplexCoderSerializable() {
CoderProperties.coderSerializable(
WindowedValue.getFullCoder(
KvCoder.of(
ShardedKeyCoder.of(StringUtf8Coder.of()),
TableRowInfoCoder.of()),
IntervalWindow.getCoder()));
}
List<TableRow> convertBigDecimalsToLong(List<TableRow> toConvert) {
// The numbers are deserialized from JSON as BigDecimal objects. Convert them back to longs
// so that the rows can be compared against the expected output.
List<TableRow> converted = Lists.newArrayList();
for (TableRow entry : toConvert) {
TableRow convertedEntry = entry.clone();
Object num = convertedEntry.get("number");
if (num instanceof BigDecimal) {
convertedEntry.set("number", ((BigDecimal) num).longValue());
}
converted.add(convertedEntry);
}
return converted;
}
}