/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io;
import static com.google.common.base.MoreObjects.firstNonNull;
import static org.apache.beam.sdk.TestUtils.LINES2_ARRAY;
import static org.apache.beam.sdk.TestUtils.LINES_ARRAY;
import static org.apache.beam.sdk.TestUtils.NO_LINES_ARRAY;
import static org.apache.beam.sdk.io.TextIO.CompressionType.AUTO;
import static org.apache.beam.sdk.io.TextIO.CompressionType.BZIP2;
import static org.apache.beam.sdk.io.TextIO.CompressionType.DEFLATE;
import static org.apache.beam.sdk.io.TextIO.CompressionType.GZIP;
import static org.apache.beam.sdk.io.TextIO.CompressionType.UNCOMPRESSED;
import static org.apache.beam.sdk.io.TextIO.CompressionType.ZIP;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasValue;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.startsWith;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
import org.apache.beam.sdk.io.FileBasedSink.WritableByteChannelFactory;
import org.apache.beam.sdk.io.TextIO.CompressionType;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.testing.NeedsRunner;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.DisplayDataEvaluator;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.PCollection;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputStream;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.ExpectedException;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
* Tests for {@link TextIO} {@link TextIO.Read} and {@link TextIO.Write} transforms.
*/
// TODO: Change the tests to use ValidatesRunner instead of NeedsRunner
@RunWith(JUnit4.class)
@SuppressWarnings("unchecked")
public class TextIOTest {
// Header and footer lines passed to TextIO.Write by the header/footer write tests.
private static final String MY_HEADER = "myHeader";
private static final String MY_FOOTER = "myFooter";
// Line fixtures: no lines, three short lines, and the 1000 lines produced by makeLines(1000).
private static final String[] EMPTY = new String[] {};
private static final String[] TINY =
new String[] {"Irritable eagle", "Optimistic jay", "Fanciful hawk"};
private static final String[] LARGE = makeLines(1000);
// Directory holding the fixture files; created in setupClass() and deleted in teardownClass().
private static Path tempFolder;
// Fixture files written once by setupClass(): one per combination of content
// (EMPTY, TINY, LARGE) and format (plain text, gzip, bzip2, zip, deflate).
private static File emptyTxt;
private static File tinyTxt;
private static File largeTxt;
private static File emptyGz;
private static File tinyGz;
private static File largeGz;
private static File emptyBzip2;
private static File tinyBzip2;
private static File largeBzip2;
private static File emptyZip;
private static File tinyZip;
private static File largeZip;
private static File emptyDeflate;
private static File tinyDeflate;
private static File largeDeflate;
// Pipeline that the tests apply transforms to and run; registered as a JUnit rule.
@Rule
public TestPipeline p = TestPipeline.create();
// JUnit rule for declaring an exception that a test expects; initialised to expect none.
@Rule
public ExpectedException expectedException = ExpectedException.none();
/**
 * Writes {@code lines} to the file {@code filename} inside {@code tempFolder}, wrapping the file
 * stream in the compressing stream that matches {@code compression}.
 *
 * @param lines lines to write
 * @param filename name of the file to create, resolved against {@code tempFolder}
 * @param compression one of {@code UNCOMPRESSED}, {@code GZIP}, {@code BZIP2}, {@code ZIP}
 *     (written as a single entry named "entry") or {@code DEFLATE}
 * @return the file that was written
 * @throws IOException if the file or the compressing stream cannot be opened
 * @throws UnsupportedOperationException if {@code compression} is any other value
 */
private static File writeToFile(String[] lines, String filename, CompressionType compression)
    throws IOException {
  File file = tempFolder.resolve(filename).toFile();
  OutputStream rawOutput = new FileOutputStream(file);
  OutputStream output = rawOutput;
  boolean wrapped = false;
  try {
    switch (compression) {
      case UNCOMPRESSED:
        break;
      case GZIP:
        output = new GZIPOutputStream(rawOutput);
        break;
      case BZIP2:
        output = new BZip2CompressorOutputStream(rawOutput);
        break;
      case ZIP:
        ZipOutputStream zipOutput = new ZipOutputStream(rawOutput);
        zipOutput.putNextEntry(new ZipEntry("entry"));
        output = zipOutput;
        break;
      case DEFLATE:
        output = new DeflateCompressorOutputStream(rawOutput);
        break;
      default:
        throw new UnsupportedOperationException(compression.toString());
    }
    wrapped = true;
  } finally {
    if (!wrapped) {
      // Setting up the compressor failed, so the helper below never gets the stream and
      // would never close it; release the file handle here instead.
      rawOutput.close();
    }
  }
  // The helper closes the outermost stream, which in turn closes the file stream.
  writeToStreamAndClose(lines, output);
  return file;
}
/**
 * Creates the shared temporary folder and fills it with the fixture files: one file for every
 * combination of content (EMPTY, TINY, LARGE) and format (text, gzip, bzip2, zip, deflate).
 */
@BeforeClass
public static void setupClass() throws IOException {
  tempFolder = Files.createTempDirectory("TextIOTest");

  // Fixtures that contain no lines.
  emptyTxt = writeToFile(EMPTY, "empty.txt", UNCOMPRESSED);
  emptyGz = writeToFile(EMPTY, "empty.gz", GZIP);
  emptyBzip2 = writeToFile(EMPTY, "empty.bz2", BZIP2);
  emptyZip = writeToFile(EMPTY, "empty.zip", ZIP);
  emptyDeflate = writeToFile(EMPTY, "empty.deflate", DEFLATE);

  // Fixtures that contain the three TINY lines.
  tinyTxt = writeToFile(TINY, "tiny.txt", UNCOMPRESSED);
  tinyGz = writeToFile(TINY, "tiny.gz", GZIP);
  tinyBzip2 = writeToFile(TINY, "tiny.bz2", BZIP2);
  tinyZip = writeToFile(TINY, "tiny.zip", ZIP);
  tinyDeflate = writeToFile(TINY, "tiny.deflate", DEFLATE);

  // Fixtures that contain the LARGE generated lines.
  largeTxt = writeToFile(LARGE, "large.txt", UNCOMPRESSED);
  largeGz = writeToFile(LARGE, "large.gz", GZIP);
  largeBzip2 = writeToFile(LARGE, "large.bz2", BZIP2);
  largeZip = writeToFile(LARGE, "large.zip", ZIP);
  largeDeflate = writeToFile(LARGE, "large.deflate", DEFLATE);
}
/**
 * Recursively deletes the temporary folder and everything the tests wrote into it.
 *
 * @throws IOException if a file or directory cannot be listed or deleted
 */
@AfterClass
public static void teardownClass() throws IOException {
  if (tempFolder == null) {
    // The folder was never created (setupClass() failed early), so there is nothing to remove.
    return;
  }
  Files.walkFileTree(tempFolder, new SimpleFileVisitor<Path>() {
    @Override
    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
      Files.delete(file);
      return FileVisitResult.CONTINUE;
    }

    @Override
    public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
      if (exc != null) {
        // Listing the directory failed part-way; report that rather than the less helpful
        // error that deleting a still-populated directory would produce.
        throw exc;
      }
      Files.delete(dir);
      return FileVisitResult.CONTINUE;
    }
  });
}
/**
 * Writes {@code expected} to a fresh temporary file, one element per line, reads the file back
 * with {@code TextIO.read()} and asserts that the pipeline output contains exactly those
 * elements in any order.
 *
 * @param expected the lines to write and then expect back
 * @throws Exception if the file cannot be written or an element cannot be encoded
 */
private <T> void runTestRead(String[] expected) throws Exception {
  File tmpFile = Files.createTempFile(tempFolder, "file", "txt").toFile();
  String filename = tmpFile.getPath();
  // The elements are encoded with StringUtf8Coder, so decode and write them explicitly as
  // UTF-8 instead of relying on the platform default charset.
  try (PrintStream writer =
      new PrintStream(new FileOutputStream(tmpFile), false, StandardCharsets.UTF_8.name())) {
    for (String elem : expected) {
      byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem);
      String line = new String(encodedElem, StandardCharsets.UTF_8);
      writer.println(line);
    }
  }
  TextIO.Read read = TextIO.read().from(filename);
  PCollection<String> output = p.apply(read);
  PAssert.that(output).containsInAnyOrder(expected);
  p.run();
}
/** Reads back a file holding {@code LINES_ARRAY} and expects exactly those lines. */
@Test
@Category(NeedsRunner.class)
public void testReadStrings() throws Exception {
  final String[] lines = LINES_ARRAY;
  runTestRead(lines);
}
/** Reads back a file written from {@code NO_LINES_ARRAY} and expects exactly those lines. */
@Test
@Category(NeedsRunner.class)
public void testReadEmptyStrings() throws Exception {
  final String[] lines = NO_LINES_ARRAY;
  runTestRead(lines);
}
/**
 * Checks the name of the read output, both with the default transform name and with an
 * explicit one. The pipeline is never run, so abandoned-node enforcement is switched off.
 */
@Test
public void testReadNamed() throws Exception {
  p.enableAbandonedNodeEnforcement(false);

  PCollection<String> defaultNamed = p.apply(TextIO.read().from("somefile"));
  assertEquals("TextIO.Read/Read.out", defaultNamed.getName());

  PCollection<String> explicitlyNamed =
      p.apply("MyRead", TextIO.read().from(emptyTxt.getPath()));
  assertEquals("MyRead/Read.out", explicitlyNamed.getName());
}
/** Checks that a read exposes its file pattern and compression type as display data. */
@Test
public void testReadDisplayData() {
  DisplayData displayData =
      DisplayData.from(TextIO.read().from("foo.*").withCompressionType(BZIP2));

  assertThat(displayData, hasDisplayItem("filePattern", "foo.*"));
  assertThat(displayData, hasDisplayItem("compressionType", BZIP2.toString()));
}
/**
 * Checks that the primitive source transforms of a read carry a display item whose value
 * starts with the file prefix.
 */
@Test
@Category(ValidatesRunner.class)
public void testPrimitiveReadDisplayData() {
  DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
  Set<DisplayData> displayData =
      evaluator.displayDataForPrimitiveSourceTransforms(TextIO.read().from("foobar"));

  assertThat(
      "TextIO.Read should include the file prefix in its primitive display data",
      displayData,
      hasItem(hasDisplayItem(hasValue(startsWith("foobar")))));
}
/** Writes {@code elems} to a single output file with no header and no footer, then verifies it. */
private void runTestWrite(String[] elems) throws Exception {
  final int singleShard = 1;
  runTestWrite(elems, singleShard);
}
/** Writes {@code elems} with the given shard count and no header or footer, then verifies it. */
private void runTestWrite(String[] elems, int numShards) throws Exception {
  final String noHeader = null;
  final String noFooter = null;
  runTestWrite(elems, noHeader, noFooter, numShards);
}
/** Writes {@code elems} to a single output file with the given header and footer, then verifies it. */
private void runTestWrite(String[] elems, String header, String footer)
    throws Exception {
  final int singleShard = 1;
  runTestWrite(elems, header, footer, singleShard);
}
/**
 * Runs a pipeline that writes {@code elems} with {@code TextIO.write()} into a new directory
 * under {@code tempFolder}, then checks the produced files with {@code assertOutputFiles}.
 *
 * @param elems the elements to write
 * @param header header passed to {@code withHeader}; may be null
 * @param footer footer passed to {@code withFooter}; may be null
 * @param numShards 1 writes without sharding; larger values set that shard count and the
 *     INDEX_OF_MAX shard name template; any other value leaves sharding unconfigured
 */
private void runTestWrite(
    String[] elems, String header, String footer, int numShards) throws Exception {
  final String outputName = "file.txt";
  final Path baseDir = Files.createTempDirectory(tempFolder, "testwrite");
  final String baseFilename = baseDir.resolve(outputName).toString();

  PCollection<String> input =
      p.apply(Create.of(Arrays.asList(elems)).withCoder(StringUtf8Coder.of()));

  TextIO.Write write = TextIO.write().to(baseFilename).withHeader(header).withFooter(footer);
  if (numShards > 1) {
    write = write.withNumShards(numShards).withShardNameTemplate(ShardNameTemplate.INDEX_OF_MAX);
  } else if (numShards == 1) {
    write = write.withoutSharding();
  }

  input.apply(write);
  p.run();

  String shardTemplate =
      firstNonNull(write.getShardTemplate(), DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE);
  assertOutputFiles(elems, header, footer, numShards, baseDir, outputName, shardTemplate);
}
/**
 * Asserts that the files produced by a text write contain exactly {@code elems} (in any order,
 * across all files) and that every file starts with {@code header} and ends with
 * {@code footer} when those are non-null.
 *
 * <p>Files are decoded as UTF-8, matching the {@code StringUtf8Coder} encoding used to build
 * the expected lines.
 *
 * @param elems the elements expected in the output
 * @param header expected first line of every file, or null for none
 * @param footer expected last line of every file, or null for none
 * @param numShards number of shard files to look for; 0 means match every file whose name
 *     starts with {@code outputName} under {@code rootLocation}
 * @param rootLocation directory containing the output files
 * @param outputName base name of the output files
 * @param shardNameTemplate template used to build each shard's file name
 * @throws Exception if matching or reading the files fails
 */
public static void assertOutputFiles(
    String[] elems,
    final String header,
    final String footer,
    int numShards,
    Path rootLocation,
    String outputName,
    String shardNameTemplate)
    throws Exception {
  List<File> expectedFiles = new ArrayList<>();
  if (numShards == 0) {
    // Shard count unknown: discover the files by globbing on the output name.
    String pattern = rootLocation.toAbsolutePath().resolve(outputName + "*").toString();
    List<MatchResult> matches = FileSystems.match(Collections.singletonList(pattern));
    for (Metadata expectedFile : Iterables.getOnlyElement(matches).metadata()) {
      expectedFiles.add(new File(expectedFile.resourceId().toString()));
    }
  } else {
    for (int i = 0; i < numShards; i++) {
      expectedFiles.add(
          new File(
              rootLocation.toString(),
              DefaultFilenamePolicy.constructName(
                  outputName, shardNameTemplate, "", i, numShards)));
    }
  }

  // Read every file into its own list of lines, decoding explicitly as UTF-8.
  List<List<String>> actual = new ArrayList<>();
  for (File tmpFile : expectedFiles) {
    try (BufferedReader reader =
        Files.newBufferedReader(tmpFile.toPath(), StandardCharsets.UTF_8)) {
      List<String> currentFile = new ArrayList<>();
      String line;
      while ((line = reader.readLine()) != null) {
        currentFile.add(line);
      }
      actual.add(currentFile);
    }
  }

  List<String> expectedElements = new ArrayList<>(elems.length);
  for (String elem : elems) {
    byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem);
    expectedElements.add(new String(encodedElem, StandardCharsets.UTF_8));
  }

  // Compare the payload lines with header and footer stripped from each file.
  List<String> actualElements =
      Lists.newArrayList(
          Iterables.concat(
              FluentIterable
                  .from(actual)
                  .transform(removeHeaderAndFooter(header, footer))
                  .toList()));
  assertThat(actualElements, containsInAnyOrder(expectedElements.toArray()));
  assertTrue(Iterables.all(actual, haveProperHeaderAndFooter(header, footer)));
}
/**
 * Returns a function that copies a file's lines and drops the first line when {@code header}
 * is non-null and the last line when {@code footer} is non-null.
 */
private static Function<List<String>, List<String>> removeHeaderAndFooter(final String header,
    final String footer) {
  final boolean dropFirst = header != null;
  final boolean dropLast = footer != null;
  return new Function<List<String>, List<String>>() {
    @Nullable
    @Override
    public List<String> apply(List<String> lines) {
      // Work on a copy so the caller's list is left untouched.
      ArrayList<String> body = Lists.newArrayList(lines);
      if (dropFirst) {
        body.remove(0);
      }
      if (dropLast) {
        body.remove(body.size() - 1);
      }
      return body;
    }
  };
}
/**
 * Returns a predicate that accepts a file's lines when the first line equals {@code header}
 * and the last line equals {@code footer}; a null header or footer is not checked.
 */
private static Predicate<List<String>> haveProperHeaderAndFooter(final String header,
    final String footer) {
  return new Predicate<List<String>>() {
    @Override
    public boolean apply(List<String> fileLines) {
      if (header != null && !fileLines.get(0).equals(header)) {
        return false;
      }
      if (footer != null && !fileLines.get(fileLines.size() - 1).equals(footer)) {
        return false;
      }
      return true;
    }
  };
}
/** Writes {@code LINES_ARRAY} with the single-file defaults and verifies the output. */
@Test
@Category(NeedsRunner.class)
public void testWriteStrings() throws Exception {
  final String[] lines = LINES_ARRAY;
  runTestWrite(lines);
}
/**
 * Writes {@code NO_LINES_ARRAY} with a shard count of 0, for which the write helper leaves
 * sharding unconfigured and the output files are located by pattern match.
 */
@Test
@Category(NeedsRunner.class)
public void testWriteEmptyStringsNoSharding() throws Exception {
  final int unspecifiedShards = 0;
  runTestWrite(NO_LINES_ARRAY, unspecifiedShards);
}
/** Writes {@code NO_LINES_ARRAY} with the single-file defaults and verifies the output. */
@Test
@Category(NeedsRunner.class)
public void testWriteEmptyStrings() throws Exception {
  final String[] lines = NO_LINES_ARRAY;
  runTestWrite(lines);
}
/** Writes {@code LINES_ARRAY} into five shards and verifies the output. */
@Test
@Category(NeedsRunner.class)
public void testShardedWrite() throws Exception {
  final int shardCount = 5;
  runTestWrite(LINES_ARRAY, shardCount);
}
/** Writes {@code LINES_ARRAY} with a header only and verifies the output. */
@Test
@Category(NeedsRunner.class)
public void testWriteWithHeader() throws Exception {
  final String noFooter = null;
  runTestWrite(LINES_ARRAY, MY_HEADER, noFooter);
}
/** Writes {@code LINES_ARRAY} with a footer only and verifies the output. */
@Test
@Category(NeedsRunner.class)
public void testWriteWithFooter() throws Exception {
  final String noHeader = null;
  runTestWrite(LINES_ARRAY, noHeader, MY_FOOTER);
}
/** Writes {@code LINES_ARRAY} with both a header and a footer and verifies the output. */
@Test
@Category(NeedsRunner.class)
public void testWriteWithHeaderAndFooter() throws Exception {
  final String[] lines = LINES_ARRAY;
  runTestWrite(lines, MY_HEADER, MY_FOOTER);
}
/**
 * Writes {@code LINES2_ARRAY} through a {@code DrunkWritableByteChannelFactory}, checks that
 * the factory shows up in the display data, and expects every input line twice in the output
 * file whose name carries the factory's filename suffix.
 */
@Test
@Category(NeedsRunner.class)
public void testWriteWithWritableByteChannelFactory() throws Exception {
  final String outputName = "file.txt";
  final Path baseDir = Files.createTempDirectory(tempFolder, "testwrite");

  Coder<String> coder = StringUtf8Coder.of();
  PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES2_ARRAY)).withCoder(coder));

  final WritableByteChannelFactory writableByteChannelFactory =
      new DrunkWritableByteChannelFactory();
  TextIO.Write write =
      TextIO.write()
          .to(baseDir.resolve(outputName).toString())
          .withoutSharding()
          .withWritableByteChannelFactory(writableByteChannelFactory);

  assertThat(DisplayData.from(write), hasDisplayItem("writableByteChannelFactory", "DRUNK"));

  input.apply(write);
  p.run();

  // Expect each input line to appear twice.
  final List<String> drunkElems = new ArrayList<>(LINES2_ARRAY.length * 2 + 2);
  for (String elem : LINES2_ARRAY) {
    drunkElems.addAll(Collections.nCopies(2, elem));
  }
  String expectedName = outputName + writableByteChannelFactory.getFilenameSuffix();
  assertOutputFiles(
      drunkElems.toArray(new String[0]),
      null,
      null,
      1,
      baseDir,
      expectedName,
      write.getShardTemplate());
}
/** Checks that every configured property of a write is exposed as display data. */
@Test
public void testWriteDisplayData() {
  DisplayData displayData =
      DisplayData.from(
          TextIO.write()
              .to("/foo")
              .withSuffix("bar")
              .withShardNameTemplate("-SS-of-NN-")
              .withNumShards(100)
              .withFooter("myFooter")
              .withHeader("myHeader"));

  assertThat(displayData, hasDisplayItem("filePrefix", "/foo"));
  assertThat(displayData, hasDisplayItem("fileSuffix", "bar"));
  assertThat(displayData, hasDisplayItem("fileHeader", "myHeader"));
  assertThat(displayData, hasDisplayItem("fileFooter", "myFooter"));
  assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-"));
  assertThat(displayData, hasDisplayItem("numShards", 100));
  assertThat(displayData, hasDisplayItem("writableByteChannelFactory", "UNCOMPRESSED"));
}
/** Checks that a header set after the destination is exposed as display data. */
@Test
public void testWriteDisplayDataValidateThenHeader() {
  DisplayData displayData = DisplayData.from(TextIO.write().to("foo").withHeader("myHeader"));

  assertThat(displayData, hasDisplayItem("fileHeader", "myHeader"));
}
/** Checks that a footer set after the destination is exposed as display data. */
@Test
public void testWriteDisplayDataValidateThenFooter() {
  DisplayData displayData = DisplayData.from(TextIO.write().to("foo").withFooter("myFooter"));

  assertThat(displayData, hasDisplayItem("fileFooter", "myFooter"));
}
/**
 * Options for testing.
 *
 * <p>Declares two {@link ValueProvider}-typed string properties, {@code input} and
 * {@code output}, which {@code testRuntimeOptionsNotCalledInApply} passes to
 * {@code TextIO.read().from(...)} and {@code TextIO.write().to(...)}.
 */
public interface RuntimeTestOptions extends PipelineOptions {
// Location to read from, supplied as a ValueProvider.
ValueProvider<String> getInput();
void setInput(ValueProvider<String> value);
// Location to write to, supplied as a ValueProvider.
ValueProvider<String> getOutput();
void setOutput(ValueProvider<String> value);
}
/**
 * Applies a read and a write configured from the {@code ValueProvider} options of a freshly
 * created {@code RuntimeTestOptions}; the test passes if applying them does not throw. The
 * pipeline is never run, so abandoned-node enforcement is switched off.
 */
@Test
public void testRuntimeOptionsNotCalledInApply() throws Exception {
  p.enableAbandonedNodeEnforcement(false);
  RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);

  PCollection<String> lines = p.apply(TextIO.read().from(options.getInput()));
  lines.apply(TextIO.write().to(options.getOutput()));
}
/** Checks that the compression type defaults to AUTO and reflects an explicit setting. */
@Test
public void testCompressionTypeIsSet() throws Exception {
  TextIO.Read defaultRead = TextIO.read().from("/tmp/test");
  assertEquals(AUTO, defaultRead.getCompressionType());

  TextIO.Read gzipRead = TextIO.read().from("/tmp/test").withCompressionType(GZIP);
  assertEquals(GZIP, gzipRead.getCompressionType());
}
/**
 * Helper that writes the given lines to a stream as UTF-8, each followed by the platform line
 * separator, then closes the stream.
 *
 * @param lines the lines to write
 * @param outputStream the stream to write to; always closed before this method returns
 * @throws IllegalStateException if writing to or flushing the stream fails
 */
private static void writeToStreamAndClose(String[] lines, OutputStream outputStream) {
  PrintStream writer;
  try {
    // Name the charset explicitly so the bytes do not depend on the platform default.
    writer = new PrintStream(outputStream, false, StandardCharsets.UTF_8.name());
  } catch (java.io.UnsupportedEncodingException e) {
    // Cannot happen: UTF-8 is a charset every Java platform is required to support.
    throw new IllegalStateException(e);
  }
  try {
    for (String line : lines) {
      writer.println(line);
    }
    // PrintStream never throws IOException; it only records the failure, so ask for it.
    if (writer.checkError()) {
      throw new IllegalStateException("Failed to write test data to the output stream");
    }
  } finally {
    writer.close();
  }
}
/**
 * Reads {@code file} with {@code TextIO.read()} using {@code compressionType}, runs the
 * pipeline, and asserts that the lines read equal {@code expected} in any order. The read is
 * applied under a name built from the file and the compression type.
 */
private void assertReadingCompressedFileMatchesExpected(
    File file, CompressionType compressionType, String[] expected) {
  TextIO.Read read = TextIO.read().from(file.getPath()).withCompressionType(compressionType);
  String transformName = "Read_" + file + "_" + compressionType.toString();

  PCollection<String> output = p.apply(transformName, read);
  PAssert.that(output).containsInAnyOrder(expected);
  p.run();
}
/**
 * Builds an array of {@code n} compressible strings: element {@code i} is {@code "word" + i}.
 *
 * @param n number of lines to generate
 * @return the generated lines, in index order
 */
private static String[] makeLines(int n) {
  String[] lines = new String[n];
  int index = 0;
  while (index < n) {
    lines[index] = "word" + index;
    index++;
  }
  return lines;
}
/**
 * Reads a small gzipped file whose name has no .gz extension, with GZIP compression set
 * explicitly.
 */
@Test
@Category(NeedsRunner.class)
public void testSmallCompressedGzipReadNoExtension() throws Exception {
  final String name = "tiny_gz_no_extension";
  File smallGzNoExtension = writeToFile(TINY, name, GZIP);
  assertReadingCompressedFileMatchesExpected(smallGzNoExtension, GZIP, TINY);
}
/**
 * Reads a small file that carries a .gz extension but is not actually compressed. Reading
 * must succeed in both GZIP and AUTO modes, because some network file systems and HTTP
 * clients transparently decompress gzipped content.
 */
@Test
@Category(NeedsRunner.class)
public void testSmallCompressedGzipReadActuallyUncompressed() throws Exception {
  File smallGzNotCompressed = writeToFile(TINY, "tiny_uncompressed.gz", UNCOMPRESSED);

  // Explicit GZIP first, then AUTO detection; both must return the plain lines.
  CompressionType[] modes = {GZIP, AUTO};
  for (int i = 0; i < modes.length; i++) {
    assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, modes[i], TINY);
  }
}
/**
 * Reads a small bzip2 file whose name has no .bz2 extension, with BZIP2 compression set
 * explicitly.
 */
@Test
@Category(NeedsRunner.class)
public void testSmallCompressedBzip2ReadNoExtension() throws Exception {
  final String name = "tiny_bz2_no_extension";
  File smallBz2NoExtension = writeToFile(TINY, name, BZIP2);
  assertReadingCompressedFileMatchesExpected(smallBz2NoExtension, BZIP2, TINY);
}
/**
 * Creates a zip file in {@code tempFolder} with one zip entry per element of
 * {@code fieldsEntries}; entries are named by their index ("0", "1", ...), and each field is
 * written as one UTF-8 line.
 *
 * @param expected list that receives every field written, in write order
 * @param filename name of the zip file, resolved against {@code tempFolder}; must not be null
 * @param fieldsEntries the fields to write, one array per zip entry; an empty array produces
 *     an empty entry, and passing no arrays produces a zip file without entries
 * @return the path of the zip file
 * @throws Exception in case of a failure during zip file creation
 */
private String createZipFile(List<String> expected, String filename, String[]... fieldsEntries)
    throws Exception {
  File tmpFile = tempFolder.resolve(filename).toFile();
  String tmpFileName = tmpFile.getPath();
  // try-with-resources closes both streams even when writing an entry fails part-way.
  try (ZipOutputStream out = new ZipOutputStream(new FileOutputStream(tmpFile));
      PrintStream writer =
          new PrintStream(out, true /* auto-flush on write */, StandardCharsets.UTF_8.name())) {
    int index = 0;
    for (String[] entry : fieldsEntries) {
      out.putNextEntry(new ZipEntry(Integer.toString(index)));
      for (String field : entry) {
        writer.println(field);
        expected.add(field);
      }
      out.closeEntry();
      index++;
    }
  }
  return tmpFileName;
}
/** Reads the plain-text fixtures in both AUTO and UNCOMPRESSED modes. */
@Test
@Category(NeedsRunner.class)
public void testTxtRead() throws Exception {
  // A non-compressed extension must be readable with detection and with no compression.
  CompressionType[] modes = {AUTO, UNCOMPRESSED};
  for (int i = 0; i < modes.length; i++) {
    CompressionType mode = modes[i];
    assertReadingCompressedFileMatchesExpected(emptyTxt, mode, EMPTY);
    assertReadingCompressedFileMatchesExpected(tinyTxt, mode, TINY);
    assertReadingCompressedFileMatchesExpected(largeTxt, mode, LARGE);
  }
}
/**
 * Reads the gzip fixtures in AUTO and GZIP modes, confirms the large fixture really is smaller
 * than its plain-text counterpart, and reads a gzip file without a .gz extension in GZIP mode.
 */
@Test
@Category(NeedsRunner.class)
public void testGzipCompressedRead() throws Exception {
  // The .gz extension must be handled both by detection and by the explicit mode.
  CompressionType[] modes = {AUTO, GZIP};
  for (int i = 0; i < modes.length; i++) {
    CompressionType mode = modes[i];
    assertReadingCompressedFileMatchesExpected(emptyGz, mode, EMPTY);
    assertReadingCompressedFileMatchesExpected(tinyGz, mode, TINY);
    assertReadingCompressedFileMatchesExpected(largeGz, mode, LARGE);
  }

  // Guard against fixtures that were not compressed at all.
  assertThat(largeTxt.length(), greaterThan(largeGz.length()));

  // Without the extension, only the explicit mode identifies the format.
  File gzFile = writeToFile(TINY, "tiny_gz_no_extension", GZIP);
  assertReadingCompressedFileMatchesExpected(gzFile, GZIP, TINY);
}
/**
 * Reads the bzip2 fixtures in AUTO and BZIP2 modes, confirms the large fixture really is
 * smaller than its plain-text counterpart, and reads a bzip2 file without a .bz2 extension in
 * BZIP2 mode.
 */
@Test
@Category(NeedsRunner.class)
public void testBzip2CompressedRead() throws Exception {
  // The .bz2 extension must be handled both by detection and by the explicit mode.
  CompressionType[] modes = {AUTO, BZIP2};
  for (int i = 0; i < modes.length; i++) {
    CompressionType mode = modes[i];
    assertReadingCompressedFileMatchesExpected(emptyBzip2, mode, EMPTY);
    assertReadingCompressedFileMatchesExpected(tinyBzip2, mode, TINY);
    assertReadingCompressedFileMatchesExpected(largeBzip2, mode, LARGE);
  }

  // Guard against fixtures that were not compressed at all.
  assertThat(largeTxt.length(), greaterThan(largeBzip2.length()));

  // Without the extension, only the explicit mode identifies the format.
  File bz2File = writeToFile(TINY, "tiny_bz2_no_extension", BZIP2);
  assertReadingCompressedFileMatchesExpected(bz2File, BZIP2, TINY);
}
/**
 * Reads the zip fixtures in AUTO and ZIP modes, confirms the large fixture really is smaller
 * than its plain-text counterpart, and reads a zip file without a .zip extension in ZIP mode.
 */
@Test
@Category(NeedsRunner.class)
public void testZipCompressedRead() throws Exception {
  // The .zip extension must be handled both by detection and by the explicit mode.
  CompressionType[] modes = {AUTO, ZIP};
  for (int i = 0; i < modes.length; i++) {
    CompressionType mode = modes[i];
    assertReadingCompressedFileMatchesExpected(emptyZip, mode, EMPTY);
    assertReadingCompressedFileMatchesExpected(tinyZip, mode, TINY);
    assertReadingCompressedFileMatchesExpected(largeZip, mode, LARGE);
  }

  // Guard against fixtures that were not compressed at all.
  assertThat(largeTxt.length(), greaterThan(largeZip.length()));

  // Without the extension, only the explicit mode identifies the format.
  File zipFile = writeToFile(TINY, "tiny_zip_no_extension", ZIP);
  assertReadingCompressedFileMatchesExpected(zipFile, ZIP, TINY);
}
/**
 * Reads the deflate fixtures in AUTO and DEFLATE modes, confirms the large fixture really is
 * smaller than its plain-text counterpart, and reads a deflate file without a .deflate
 * extension in DEFLATE mode.
 */
@Test
@Category(NeedsRunner.class)
public void testDeflateCompressedRead() throws Exception {
  // The .deflate extension must be handled both by detection and by the explicit mode.
  CompressionType[] modes = {AUTO, DEFLATE};
  for (int i = 0; i < modes.length; i++) {
    CompressionType mode = modes[i];
    assertReadingCompressedFileMatchesExpected(emptyDeflate, mode, EMPTY);
    assertReadingCompressedFileMatchesExpected(tinyDeflate, mode, TINY);
    assertReadingCompressedFileMatchesExpected(largeDeflate, mode, LARGE);
  }

  // Guard against fixtures that were not compressed at all.
  assertThat(largeTxt.length(), greaterThan(largeDeflate.length()));

  // Without the extension, only the explicit mode identifies the format.
  File deflateFile = writeToFile(TINY, "tiny_deflate_no_extension", DEFLATE);
  assertReadingCompressedFileMatchesExpected(deflateFile, DEFLATE, TINY);
}
/**
 * Reads a zip file that has no entries at all and expects no lines. The shared zip fixtures
 * always hold a single entry, so this corner case is not covered by them.
 */
@Test
@Category(NeedsRunner.class)
public void testZipCompressedReadWithNoEntries() throws Exception {
  List<String> written = new ArrayList<String>();
  File zipFile = new File(createZipFile(written, "empty zip file"));
  assertReadingCompressedFileMatchesExpected(zipFile, CompressionType.ZIP, EMPTY);
}
/**
 * Reads a zip file with three entries and expects the lines of all of them. The shared zip
 * fixtures always hold a single entry, so this corner case is not covered by them.
 */
@Test
@Category(NeedsRunner.class)
public void testZipCompressedReadWithMultiEntriesFile() throws Exception {
  String[][] entries = {
    {"first", "second", "three"},
    {"four", "five", "six"},
    {"seven", "eight", "nine"}
  };

  // createZipFile records every field it writes into this list.
  List<String> expected = new ArrayList<>();
  File zipFile = new File(createZipFile(expected, "multiple entries", entries));

  assertReadingCompressedFileMatchesExpected(
      zipFile, CompressionType.ZIP, expected.toArray(new String[]{}));
}
/**
 * Reads a zip file laid out as data, two empty entries, then more data, and expects only the
 * data lines back.
 */
@Test
@Category(NeedsRunner.class)
public void testZipCompressedReadWithComplexEmptyAndPresentEntries() throws Exception {
  String[][] entries = {
    {"cat"},
    {},
    {},
    {"dog"}
  };
  String filename =
      createZipFile(new ArrayList<String>(), "complex empty and present entries", entries);

  assertReadingCompressedFileMatchesExpected(
      new File(filename), CompressionType.ZIP, new String[] {"cat", "dog"});
}
/** Checks the default names of the read and write transforms and the read's string form. */
@Test
public void testTextIOGetName() {
  final String location = "somefile";
  assertEquals("TextIO.Read", TextIO.read().from(location).getName());
  assertEquals("TextIO.Write", TextIO.write().to(location).getName());
  assertEquals("TextIO.Read", TextIO.read().from(location).toString());
}
/** Checks the progress values a reader reports before and after reading an empty source. */
@Test
public void testProgressEmptyFile() throws IOException {
  final double tolerance = 1e-6;
  try (BoundedReader<String> reader =
      prepareSource(new byte[0]).createReader(PipelineOptionsFactory.create())) {
    // Before start(): nothing consumed, remaining split points unknown.
    assertEquals(0.0, reader.getFractionConsumed(), tolerance);
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

    // The source holds no records.
    assertFalse(reader.start());

    // After finishing: fully consumed, no split points on either side.
    assertEquals(1.0, reader.getFractionConsumed(), tolerance);
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(0, reader.getSplitPointsRemaining());
  }
}
/**
 * Checks the progress values a reader reports while stepping through a three-line source:
 * before starting, after each record, and after the end.
 */
@Test
public void testProgressTextFile() throws IOException {
  String file = "line1\nline2\nline3";
  // Encode explicitly as UTF-8, like the other source tests, not with the platform default.
  try (BoundedReader<String> reader =
      prepareSource(file.getBytes(StandardCharsets.UTF_8))
          .createReader(PipelineOptionsFactory.create())) {
    // Check preconditions before starting
    assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
    // Line 1
    assertTrue(reader.start());
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
    // Line 2
    assertTrue(reader.advance());
    assertEquals(1, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
    // Line 3: the last record, so the reader now reports one remaining split point.
    assertTrue(reader.advance());
    assertEquals(2, reader.getSplitPointsConsumed());
    assertEquals(1, reader.getSplitPointsRemaining());
    // Check postconditions after finishing
    assertFalse(reader.advance());
    assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
    assertEquals(3, reader.getSplitPointsConsumed());
    assertEquals(0, reader.getSplitPointsRemaining());
  }
}
/**
 * Splits a three-line source at fraction 0.1 after reading the first record, then checks the
 * progress values reported by the original reader and by a reader over the remainder.
 */
@Test
public void testProgressAfterSplitting() throws IOException {
  String file = "line1\nline2\nline3";
  // Encode explicitly as UTF-8, like the other source tests, not with the platform default.
  BoundedSource<String> source = prepareSource(file.getBytes(StandardCharsets.UTF_8));
  BoundedSource<String> remainder;
  // Create the remainder, verifying properties pre- and post-splitting.
  try (BoundedReader<String> readerOrig = source.createReader(PipelineOptionsFactory.create())) {
    // Preconditions.
    assertEquals(0.0, readerOrig.getFractionConsumed(), 1e-6);
    assertEquals(0, readerOrig.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());
    // First record, before splitting.
    assertTrue(readerOrig.start());
    assertEquals(0, readerOrig.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());
    // Split. 0.1 is in line1, so should now be able to detect last record.
    remainder = readerOrig.splitAtFraction(0.1);
    assertNotNull(remainder);
    // First record, after splitting.
    assertEquals(0, readerOrig.getSplitPointsConsumed());
    assertEquals(1, readerOrig.getSplitPointsRemaining());
    // Finish and postconditions.
    assertFalse(readerOrig.advance());
    assertEquals(1.0, readerOrig.getFractionConsumed(), 1e-6);
    assertEquals(1, readerOrig.getSplitPointsConsumed());
    assertEquals(0, readerOrig.getSplitPointsRemaining());
  }
  // Check the properties of the remainder.
  try (BoundedReader<String> reader = remainder.createReader(PipelineOptionsFactory.create())) {
    // Preconditions.
    assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
    // First record should be line 2.
    assertTrue(reader.start());
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());
    // Second record is line 3
    assertTrue(reader.advance());
    assertEquals(1, reader.getSplitPointsConsumed());
    assertEquals(1, reader.getSplitPointsRemaining());
    // Check postconditions after finishing
    assertFalse(reader.advance());
    assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
    assertEquals(2, reader.getSplitPointsConsumed());
    assertEquals(0, reader.getSplitPointsRemaining());
  }
}
/** Three bare line feeds must be read as three empty lines. */
@Test
public void testReadEmptyLines() throws Exception {
  byte[] data = "\n\n\n".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("", "", "");
  runTestReadWithData(data, expected);
}
/** Lines terminated by a line feed are split correctly. */
@Test
public void testReadFileWithLineFeedDelimiter() throws Exception {
  byte[] data = "asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/** Lines terminated by a carriage return are split correctly. */
@Test
public void testReadFileWithCarriageReturnDelimiter() throws Exception {
  byte[] data = "asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/** Lines terminated by carriage return plus line feed are split correctly. */
@Test
public void testReadFileWithCarriageReturnAndLineFeedDelimiter() throws Exception {
  byte[] data = "asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/** A file mixing all three line terminators is split correctly. */
@Test
public void testReadFileWithMixedDelimiters() throws Exception {
  byte[] data = "asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/** Line-feed-delimited data whose last line has no terminator still yields that last line. */
@Test
public void testReadFileWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
  byte[] data = "asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/** Carriage-return-delimited data whose last line has no terminator still yields that line. */
@Test
public void testReadFileWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd() throws Exception {
  byte[] data = "asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/** CR+LF-delimited data whose last line has no terminator still yields that last line. */
@Test
public void testReadFileWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd()
    throws Exception {
  byte[] data = "asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
  List<String> expected = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(data, expected);
}
/**
 * Input mixing CR and CR+LF terminators, with an unterminated final record, must still yield
 * all three records.
 */
@Test
public void testReadFileWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception {
  byte[] fileContents = "asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
  ImmutableList<String> records = ImmutableList.of("asdf", "hjkl", "xyz");
  runTestReadWithData(fileContents, records);
}
/**
 * Builds a {@code TextSource} over {@code data} via {@code prepareSource}, reads every record
 * with {@code SourceTestUtils.readFromSource}, and asserts that the records read match
 * {@code expectedResults}. The comparison is order-insensitive.
 *
 * @param data raw bytes to place in the file that is read back
 * @param expectedResults records the read is expected to produce
 * @throws Exception if preparing the source or reading from it fails
 */
private void runTestReadWithData(byte[] data, List<String> expectedResults) throws Exception {
  TextSource source = prepareSource(data);
  List<String> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create());
  // toArray already returns a fresh array, so no intermediate ArrayList copy is required.
  assertThat(actual, containsInAnyOrder(expectedResults.toArray(new String[0])));
}
/** Runs the exhaustive split-at-fraction check on a source holding only three line feeds. */
@Test
public void testSplittingSourceWithEmptyLines() throws Exception {
  byte[] fileContents = "\n\n\n".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/** Runs the exhaustive split-at-fraction check on LF-delimited input. */
@Test
public void testSplittingSourceWithLineFeedDelimiter() throws Exception {
  byte[] fileContents = "asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/** Runs the exhaustive split-at-fraction check on CR-delimited input. */
@Test
public void testSplittingSourceWithCarriageReturnDelimiter() throws Exception {
  byte[] fileContents = "asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/** Runs the exhaustive split-at-fraction check on CR+LF-delimited input. */
@Test
public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiter() throws Exception {
  byte[] fileContents = "asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/** Runs the exhaustive split-at-fraction check on input mixing CR, CR+LF and LF terminators. */
@Test
public void testSplittingSourceWithMixedDelimiters() throws Exception {
  byte[] fileContents = "asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/**
 * Runs the exhaustive split-at-fraction check on LF-delimited input whose final record has no
 * trailing delimiter.
 */
@Test
public void testSplittingSourceWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
  byte[] fileContents = "asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/**
 * Runs the exhaustive split-at-fraction check on CR-delimited input whose final record has no
 * trailing delimiter.
 */
@Test
public void testSplittingSourceWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd()
    throws Exception {
  byte[] fileContents = "asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/**
 * Runs the exhaustive split-at-fraction check on CR+LF-delimited input whose final record has
 * no trailing delimiter.
 */
@Test
public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd()
    throws Exception {
  byte[] fileContents = "asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/**
 * Runs the exhaustive split-at-fraction check on input mixing CR and CR+LF terminators whose
 * final record has no trailing delimiter.
 */
@Test
public void testSplittingSourceWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception {
  byte[] fileContents = "asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
  TextSource textSource = prepareSource(fileContents);
  SourceTestUtils.assertSplitAtFractionExhaustive(textSource, PipelineOptionsFactory.create());
}
/**
 * Creates a temporary file under {@code tempFolder}, writes {@code data} into it, and returns a
 * {@code TextSource} pointing at that file through a static value provider.
 *
 * @param data bytes to store in the temporary file
 * @return a source reading from the newly written file
 * @throws IOException if the file cannot be created or written
 */
private TextSource prepareSource(byte[] data) throws IOException {
  Path tempFile = Files.createTempFile(tempFolder, "tempfile", "ext");
  Files.write(tempFile, data);
  String fileSpec = tempFile.toString();
  return new TextSource(ValueProvider.StaticValueProvider.of(fileSpec));
}
/**
 * Splitting the {@code largeTxt} file with the default read configuration must produce more
 * than one split, and the splits together must match reading the unsplit source.
 */
@Test
public void testInitialSplitAutoModeTxt() throws Exception {
  PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();
  long targetBundleSize = 1000;
  long minimumFileLength = 2 * targetBundleSize;

  // Guard: the fixture has to span at least two bundles for the test to be meaningful.
  assertThat(largeTxt.length(), greaterThan(minimumFileLength));

  FileBasedSource<String> wholeSource = TextIO.read().from(largeTxt.getPath()).getSource();
  List<? extends FileBasedSource<String>> bundles =
      wholeSource.split(targetBundleSize, pipelineOptions);

  // More than one split is expected, and together they must equal the whole file.
  assertThat(bundles, hasSize(greaterThan(1)));
  SourceTestUtils.assertSourcesEqualReferenceSource(wholeSource, bundles, pipelineOptions);
}
/**
 * Splitting the {@code largeGz} file with the default read configuration must produce exactly
 * one split that matches reading the unsplit source.
 */
@Test
public void testInitialSplitAutoModeGz() throws Exception {
  long targetBundleSize = 1000;
  long minimumFileLength = 2 * targetBundleSize;
  PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();

  // Guard: the fixture has to span at least two bundles for the test to be meaningful.
  assertThat(largeGz.length(), greaterThan(minimumFileLength));

  FileBasedSource<String> wholeSource = TextIO.read().from(largeGz.getPath()).getSource();
  List<? extends FileBasedSource<String>> bundles =
      wholeSource.split(targetBundleSize, pipelineOptions);

  // A gzip file yields a single split even in AUTO mode.
  assertThat(bundles, hasSize(equalTo(1)));
  SourceTestUtils.assertSourcesEqualReferenceSource(wholeSource, bundles, pipelineOptions);
}
/**
 * Splitting the {@code largeTxt} file while forcing the GZIP compression type must produce
 * exactly one split that matches reading the unsplit source.
 */
@Test
public void testInitialSplitGzipModeTxt() throws Exception {
  PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();
  long targetBundleSize = 1000;
  long minimumFileLength = 2 * targetBundleSize;

  // Guard: the fixture has to span at least two bundles for the test to be meaningful.
  assertThat(largeTxt.length(), greaterThan(minimumFileLength));

  FileBasedSource<String> wholeSource =
      TextIO.read().from(largeTxt.getPath()).withCompressionType(GZIP).getSource();
  List<? extends FileBasedSource<String>> bundles =
      wholeSource.split(targetBundleSize, pipelineOptions);

  // GZIP mode yields a single split even though the underlying text file is splittable.
  assertThat(bundles, hasSize(equalTo(1)));
  SourceTestUtils.assertSourcesEqualReferenceSource(wholeSource, bundles, pipelineOptions);
}
/**
 * Splitting the {@code largeGz} file while forcing the GZIP compression type must produce
 * exactly one split that matches reading the unsplit source.
 */
@Test
public void testInitialSplitGzipModeGz() throws Exception {
  PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();
  long targetBundleSize = 1000;
  long minimumFileLength = 2 * targetBundleSize;

  // Guard: the fixture has to span at least two bundles for the test to be meaningful.
  assertThat(largeGz.length(), greaterThan(minimumFileLength));

  FileBasedSource<String> wholeSource =
      TextIO.read().from(largeGz.getPath()).withCompressionType(GZIP).getSource();
  List<? extends FileBasedSource<String>> bundles =
      wholeSource.split(targetBundleSize, pipelineOptions);

  // A .gz file read in GZIP mode yields a single split.
  assertThat(bundles, hasSize(equalTo(1)));
  SourceTestUtils.assertSourcesEqualReferenceSource(wholeSource, bundles, pipelineOptions);
}
/**
 * Applying a windowed {@code TextIO.Write} that has no filename policy is expected to fail with
 * an {@link IllegalStateException} carrying the message checked below.
 */
@Test
public void testWindowedWriteRequiresFilenamePolicy() {
  PCollection<String> noElements = p.apply(Create.empty(StringUtf8Coder.of()));
  TextIO.Write windowedWrite = TextIO.write().to("/tmp/some/file").withWindowedWrites();

  // Register the expectation only after the transform is built, so the failure must come
  // from applying it.
  expectedException.expect(IllegalStateException.class);
  expectedException.expectMessage(
      "When using windowed writes, a filename policy must be set via withFilenamePolicy()");

  noElements.apply(windowedWrite);
}
}