/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.crunch;
import com.google.common.io.Files;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Collection;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.avro.Avros;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.kitesdk.compat.Hadoop;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.MiniDFSTest;
import org.kitesdk.data.Signalable;
import org.kitesdk.data.spi.PartitionKey;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.PartitionedDataset;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.LastModifiedAccessor;
import org.kitesdk.data.URIBuilder;
import org.kitesdk.data.user.NewUserRecord;
import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.USER_SCHEMA;
import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.checkTestUsers;
import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.datasetSize;
import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.writeTestUsers;
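/**
 * Tests reading and writing Kite datasets as Crunch sources and targets.
 * The suite is parameterized to run against both the MiniDFS and local file
 * systems; concrete subclasses provide the {@link DatasetRepository} under
 * test by implementing {@link #newRepo()}.
 */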
@RunWith(Parameterized.class)
public abstract class TestCrunchDatasets extends MiniDFSTest {
@Parameterized.Parameters
public static Collection<Object[]> data() throws IOException {
MiniDFSTest.setupFS();
Object[][] data = new Object[][] {
{ getDFS() },
{ getFS() }
};
return Arrays.asList(data);
}
protected FileSystem fileSystem;
protected Path testDirectory;
private DatasetRepository repo;
public TestCrunchDatasets(FileSystem fs) {
this.fileSystem = fs;
testDirectory = fileSystem.makeQualified(
new Path(Files.createTempDir().getAbsolutePath()));
}
public abstract DatasetRepository newRepo();
@Before
public void setUp() throws Exception {
this.repo = newRepo();
repo.delete("ns", "in");
repo.delete("ns", "out");
}
@After
public void tearDown() throws IOException {
fileSystem.delete(testDirectory, true);
}
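// basic round trip: read a dataset with CrunchDatasets.asSource and append it to another dataset via asTarget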
@Test
public void testGeneric() throws IOException {
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
// write two files, each with 5 records
writeTestUsers(inputDataset, 5, 0);
writeTestUsers(inputDataset, 5, 5);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputDataset));
pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
pipeline.run();
checkTestUsers(outputDataset, 10);
}
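// same round trip as testGeneric, but with Parquet-formatted datasets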
@Test
public void testGenericParquet() throws IOException {
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).format(Formats.PARQUET).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).format(Formats.PARQUET).build());
// write two files, each with 5 records
writeTestUsers(inputDataset, 5, 0);
writeTestUsers(inputDataset, 5, 5);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputDataset));
pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
pipeline.run();
checkTestUsers(outputDataset, 10);
}
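// using a single hash partition of the input as the source copies only that partition's records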
@Test
public void testPartitionedSource() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).format(Formats.PARQUET).build());
writeTestUsers(inputDataset, 10);
PartitionKey key = new PartitionKey(0);
Dataset<Record> inputPart0 =
((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputPart0));
pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(5, datasetSize(outputDataset));
}
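// a single partition can be used as both the source and the corresponding target partition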
@Test
public void testPartitionedSourceAndTarget() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
writeTestUsers(inputDataset, 10);
PartitionKey key = new PartitionKey(0);
Dataset<Record> inputPart0 =
((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
Dataset<Record> outputPart0 =
((PartitionedDataset<Record>) outputDataset).getPartition(key, true);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputPart0));
pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(5, datasetSize(outputPart0));
}
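// writing one source partition to the top-level target dataset should still place the records in the matching partition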
@Test
public void testPartitionedSourceAndTargetWritingToTopLevel() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
writeTestUsers(inputDataset, 10);
PartitionKey key = new PartitionKey(0);
Dataset<Record> inputPart0 =
((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputPart0));
pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(5, datasetSize(outputDataset));
// check all records are in the correct partition
Dataset<Record> outputPart0 =
((PartitionedDataset<Record>) outputDataset).getPartition(key, false);
Assert.assertNotNull(outputPart0);
Assert.assertEquals(5, datasetSize(outputPart0));
}
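// a view restricted to a single username can be used as the pipeline source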
@Test
public void testSourceView() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).format(Formats.PARQUET).build());
writeTestUsers(inputDataset, 10);
View<Record> inputView = inputDataset.with("username", "test-0");
Assert.assertEquals(1, datasetSize(inputView));
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputView));
pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(1, datasetSize(outputDataset));
}
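// a view restricted to a single username can be used as the pipeline target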
@Test
public void testTargetView() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
writeTestUsers(inputDataset, 10);
View<Record> inputView = inputDataset.with("username", "test-0");
Assert.assertEquals(1, datasetSize(inputView));
View<Record> outputView = outputDataset.with("username", "test-0");
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputView));
pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(1, datasetSize(outputDataset));
}
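// with a provided partitioner, the target view supplies the partition value ("version") for the written records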
@Test
public void testTargetViewProvidedPartition() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().provided("version").build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
View<Record> inputView = inputDataset.with("version", "test-version-0");
writeTestUsers(inputView, 1);
Assert.assertEquals(1, datasetSize(inputView));
View<Record> outputView = outputDataset.with("version", "test-version-0");
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputView));
pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(1, datasetSize(outputDataset));
}
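// source and target views can be referenced by view URIs built with URIBuilder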
@Test
public void testViewUris() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
writeTestUsers(inputDataset, 10);
URI sourceViewUri = new URIBuilder(repo.getUri(), "ns", "in").with("username",
"test-0").build();
View<Record> inputView = Datasets.<Record, Dataset<Record>> load(sourceViewUri,
Record.class);
Assert.assertEquals(1, datasetSize(inputView));
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets
.asSource(sourceViewUri, GenericData.Record.class));
URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out").with(
"email", "email-0").build();
pipeline.write(data, CrunchDatasets.asTarget(targetViewUri),
Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(1, datasetSize(outputDataset));
}
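// source and target datasets can be referenced by dataset URIs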
@Test
public void testDatasetUris() throws IOException {
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
"username", 2).build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
writeTestUsers(inputDataset, 10);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
GenericData.Record.class));
pipeline.write(data, CrunchDatasets.asTarget(
new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(10, datasetSize(outputDataset));
}
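// the default write mode fails when the target already contains data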
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
writeTestUsers(inputDataset, 1, 0);
writeTestUsers(outputDataset, 1, 0);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputDataset));
pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
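// OVERWRITE mode replaces the existing records in the target dataset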
@Test
public void testWriteModeOverwrite() throws IOException {
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
writeTestUsers(inputDataset, 1, 0);
writeTestUsers(outputDataset, 1, 1);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputDataset));
pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
Target.WriteMode.OVERWRITE);
pipeline.run();
checkTestUsers(outputDataset, 1);
}
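// CHECKPOINT mode skips the copy when the output is already newer than the input and re-runs it after the input changes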
@Test
public void testWriteModeCheckpoint() throws Exception {
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
writeTestUsers(inputDataset, 1, 0);
Thread.sleep(1000); // ensure output is newer than input on local filesystems with 1s granularity
runCheckpointPipeline(inputDataset, outputDataset);
// under hadoop1, issues with LocalJobRunner (MAPREDUCE-2350) require that we
// manually signal the output dataset as ready
if (Hadoop.isHadoop1()) {
((Signalable)outputDataset).signalReady();
}
checkTestUsers(outputDataset, 1);
long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();
// re-run without changing the input; the output should not change
runCheckpointPipeline(inputDataset, outputDataset);
checkTestUsers(outputDataset, 1);
Assert.assertEquals(lastModified, ((LastModifiedAccessor) outputDataset).getLastModified());
// re-write the input, then re-run; the output should be re-written
Thread.sleep(1000); // ensure new input is newer than output
repo.delete("ns", "in");
inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
writeTestUsers(inputDataset, 1, 0);
runCheckpointPipeline(inputDataset, outputDataset);
checkTestUsers(outputDataset, 1);
Assert.assertTrue(((LastModifiedAccessor) outputDataset).getLastModified() > lastModified);
}
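// CHECKPOINT mode re-runs the copy when the target view has not been signaled ready, even if the output is newer than the input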
@Test
public void testWriteModeCheckpointToNotReadyOutput() throws Exception {
// identity partition so we can overwrite the output
PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().
identity("username").build();
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.
Builder().schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.
Builder().schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
writeTestUsers(inputDataset, 1, 0);
// ensure output is newer than input on local filesystems with 1s granularity
Thread.sleep(1000);
runCheckpointPipeline(inputDataset, outputDataset);
checkTestUsers(outputDataset, 1);
// under hadoop1, issues with LocalJobRunner (MAPREDUCE-2350) require that we
// manually signal the output dataset as ready
if (Hadoop.isHadoop1()) {
((Signalable)outputDataset).signalReady();
} else {
// under hadoop2 the output will have been marked ready
Assert.assertTrue("output dataset should be ready after mapreduce", ((Signalable)outputDataset).isReady());
}
long lastModified = ((LastModifiedAccessor) outputDataset).getLastModified();
// sleep so that a re-written output gets a strictly newer timestamp on local filesystems with 1s granularity
Thread.sleep(1000);
// now output to a view; this ensures that the view isn't ready
View<Record> outputView = outputDataset.with("username", "test-0");
// re-run without changing the input; the output should change since the view is not ready
runCheckpointPipeline(inputDataset, outputView);
checkTestUsers(outputDataset, 1);
Assert.assertTrue(((LastModifiedAccessor) outputView).getLastModified() > lastModified);
}
// Statically typed identity function to ensure the expected record class is used.
static class UserRecordIdentityFn extends MapFn<NewUserRecord, NewUserRecord> {
@Override
public NewUserRecord map(NewUserRecord input) {
return input;
}
}
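// records written with the old (smaller) schema can be read back using the enhanced NewUserRecord reader schema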
@Test
public void testUseReaderSchema() throws IOException {
// Create a schema with only a username, so we can test reading it
// with an enhanced record structure.
Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
.fields()
.requiredString("username")
.endRecord();
// create the input and output datasets
Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(oldRecordSchema).build());
Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(oldRecordSchema).build());
Record oldUser = new Record(oldRecordSchema);
oldUser.put("username", "user");
DatasetWriter<Record> writer = in.newWriter();
try {
writer.write(oldUser);
} finally {
writer.close();
}
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
// read the data, which was written with the old schema, using the
// enhanced NewUserRecord reader schema
PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
NewUserRecord.class));
PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
Avros.records(NewUserRecord.class));
pipeline.write(processed, CrunchDatasets.asTarget(out));
DatasetReader<Record> reader = out.newReader();
Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());
try {
// there should be one record that is equal to our old user generic record.
Assert.assertEquals(oldUser, reader.next());
Assert.assertFalse(reader.hasNext());
} finally {
reader.close();
}
}
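// same reader-schema check as testUseReaderSchema, but with Parquet-formatted datasets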
@Test
public void testUseReaderSchemaParquet() throws IOException {
// Create a schema with only a username, so we can test reading it
// with an enhanced record structure.
Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
.fields()
.requiredString("username")
.endRecord();
// create the input and output datasets
Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
.format(Formats.PARQUET).schema(oldRecordSchema).build());
Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
.format(Formats.PARQUET).schema(oldRecordSchema).build());
Record oldUser = new Record(oldRecordSchema);
oldUser.put("username", "user");
DatasetWriter<Record> writer = in.newWriter();
try {
writer.write(oldUser);
} finally {
writer.close();
}
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
// read the data, which was written with the old schema, using the
// enhanced NewUserRecord reader schema
PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
NewUserRecord.class));
PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
Avros.records(NewUserRecord.class));
pipeline.write(processed, CrunchDatasets.asTarget(out));
DatasetReader<Record> reader = out.newReader();
Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());
try {
// there should be one record that is equal to our old user generic record.
Assert.assertEquals(oldUser, reader.next());
Assert.assertFalse(reader.hasNext());
} finally {
reader.close();
}
}
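// writing to a view should signal only that view as ready, not the whole output dataset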
@Test
public void testSignalReadyOutputView() {
Assume.assumeTrue(!Hadoop.isHadoop1());
Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
writeTestUsers(inputDataset, 10);
View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
Assert.assertEquals(2, datasetSize(inputView));
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputView));
pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
pipeline.run();
Assert.assertEquals(2, datasetSize(outputView));
Assert.assertFalse("Output dataset should not be signaled ready",
((Signalable)outputDataset).isReady());
Assert.assertTrue("Output view should be signaled ready",
((Signalable)outputView).isReady());
}
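// helper: copies inputView to outputView using CHECKPOINT write mode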
private void runCheckpointPipeline(View<Record> inputView,
View<Record> outputView) {
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> data = pipeline.read(
CrunchDatasets.asSource(inputView));
pipeline.write(data, CrunchDatasets.asTarget(outputView),
Target.WriteMode.CHECKPOINT);
pipeline.done();
}
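// a union of two dataset sources can be written to a single target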
@Test
public void testMultipleFileReadingFromCrunch() throws IOException {
Dataset<Record> inputDatasetA = repo.create("ns", "inA", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> inputDatasetB = repo.create("ns", "inB", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
.schema(USER_SCHEMA).build());
// write 5 records to each input dataset
writeTestUsers(inputDatasetA, 5, 0);
writeTestUsers(inputDatasetB, 5, 5);
Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
PCollection<GenericData.Record> dataA = pipeline.read(
CrunchDatasets.asSource(inputDatasetA));
PCollection<GenericData.Record> dataB = pipeline.read(
CrunchDatasets.asSource(inputDatasetB));
pipeline.write(dataA.union(dataB), CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
pipeline.run();
checkTestUsers(outputDataset, 10);
}
}