/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io; import static com.google.common.base.MoreObjects.firstNonNull; import static org.apache.beam.sdk.TestUtils.LINES2_ARRAY; import static org.apache.beam.sdk.TestUtils.LINES_ARRAY; import static org.apache.beam.sdk.TestUtils.NO_LINES_ARRAY; import static org.apache.beam.sdk.io.TextIO.CompressionType.AUTO; import static org.apache.beam.sdk.io.TextIO.CompressionType.BZIP2; import static org.apache.beam.sdk.io.TextIO.CompressionType.DEFLATE; import static org.apache.beam.sdk.io.TextIO.CompressionType.GZIP; import static org.apache.beam.sdk.io.TextIO.CompressionType.UNCOMPRESSED; import static org.apache.beam.sdk.io.TextIO.CompressionType.ZIP; import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem; import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasValue; import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.startsWith; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import com.google.common.base.Function; import com.google.common.base.Predicate; import com.google.common.collect.FluentIterable; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; import javax.annotation.Nullable; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.io.BoundedSource.BoundedReader; import org.apache.beam.sdk.io.FileBasedSink.WritableByteChannelFactory; import org.apache.beam.sdk.io.TextIO.CompressionType; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.io.fs.MatchResult.Metadata; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.testing.NeedsRunner; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.SourceTestUtils; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.transforms.display.DisplayDataEvaluator; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.PCollection; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; import org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputStream; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.rules.ExpectedException; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; /** * Tests for {@link TextIO} {@link TextIO.Read} and {@link TextIO.Write} transforms. */ // TODO: Change the tests to use ValidatesRunner instead of NeedsRunner @RunWith(JUnit4.class) @SuppressWarnings("unchecked") public class TextIOTest { private static final String MY_HEADER = "myHeader"; private static final String MY_FOOTER = "myFooter"; private static final String[] EMPTY = new String[] {}; private static final String[] TINY = new String[] {"Irritable eagle", "Optimistic jay", "Fanciful hawk"}; private static final String[] LARGE = makeLines(1000); private static Path tempFolder; private static File emptyTxt; private static File tinyTxt; private static File largeTxt; private static File emptyGz; private static File tinyGz; private static File largeGz; private static File emptyBzip2; private static File tinyBzip2; private static File largeBzip2; private static File emptyZip; private static File tinyZip; private static File largeZip; private static File emptyDeflate; private static File tinyDeflate; private static File largeDeflate; @Rule public TestPipeline p = TestPipeline.create(); @Rule public ExpectedException expectedException = ExpectedException.none(); private static File writeToFile(String[] lines, String filename, CompressionType compression) throws IOException { File file = tempFolder.resolve(filename).toFile(); OutputStream output = new FileOutputStream(file); switch (compression) { case UNCOMPRESSED: break; case GZIP: output = new GZIPOutputStream(output); break; case BZIP2: output = new BZip2CompressorOutputStream(output); break; case ZIP: ZipOutputStream zipOutput = new ZipOutputStream(output); zipOutput.putNextEntry(new ZipEntry("entry")); output = zipOutput; break; case DEFLATE: output = new DeflateCompressorOutputStream(output); break; default: throw new UnsupportedOperationException(compression.toString()); } writeToStreamAndClose(lines, output); return file; } @BeforeClass public static void setupClass() throws IOException { tempFolder = Files.createTempDirectory("TextIOTest"); // empty files emptyTxt = writeToFile(EMPTY, "empty.txt", CompressionType.UNCOMPRESSED); emptyGz = writeToFile(EMPTY, "empty.gz", GZIP); emptyBzip2 = writeToFile(EMPTY, "empty.bz2", BZIP2); emptyZip = writeToFile(EMPTY, "empty.zip", ZIP); emptyDeflate = writeToFile(EMPTY, "empty.deflate", DEFLATE); // tiny files tinyTxt = writeToFile(TINY, "tiny.txt", CompressionType.UNCOMPRESSED); tinyGz = writeToFile(TINY, "tiny.gz", GZIP); tinyBzip2 = writeToFile(TINY, "tiny.bz2", BZIP2); tinyZip = writeToFile(TINY, "tiny.zip", ZIP); tinyDeflate = writeToFile(TINY, "tiny.deflate", DEFLATE); // large files largeTxt = writeToFile(LARGE, "large.txt", CompressionType.UNCOMPRESSED); largeGz = writeToFile(LARGE, "large.gz", GZIP); largeBzip2 = writeToFile(LARGE, "large.bz2", BZIP2); largeZip = writeToFile(LARGE, "large.zip", ZIP); largeDeflate = writeToFile(LARGE, "large.deflate", DEFLATE); } @AfterClass public static void teardownClass() throws IOException { Files.walkFileTree(tempFolder, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { Files.delete(file); return FileVisitResult.CONTINUE; } @Override public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { Files.delete(dir); return FileVisitResult.CONTINUE; } }); } private <T> void runTestRead(String[] expected) throws Exception { File tmpFile = Files.createTempFile(tempFolder, "file", "txt").toFile(); String filename = tmpFile.getPath(); try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); String line = new String(encodedElem); writer.println(line); } } TextIO.Read read = TextIO.read().from(filename); PCollection<String> output = p.apply(read); PAssert.that(output).containsInAnyOrder(expected); p.run(); } @Test @Category(NeedsRunner.class) public void testReadStrings() throws Exception { runTestRead(LINES_ARRAY); } @Test @Category(NeedsRunner.class) public void testReadEmptyStrings() throws Exception { runTestRead(NO_LINES_ARRAY); } @Test public void testReadNamed() throws Exception { p.enableAbandonedNodeEnforcement(false); assertEquals( "TextIO.Read/Read.out", p.apply(TextIO.read().from("somefile")).getName()); assertEquals( "MyRead/Read.out", p.apply("MyRead", TextIO.read().from(emptyTxt.getPath())).getName()); } @Test public void testReadDisplayData() { TextIO.Read read = TextIO.read() .from("foo.*") .withCompressionType(BZIP2); DisplayData displayData = DisplayData.from(read); assertThat(displayData, hasDisplayItem("filePattern", "foo.*")); assertThat(displayData, hasDisplayItem("compressionType", BZIP2.toString())); } @Test @Category(ValidatesRunner.class) public void testPrimitiveReadDisplayData() { DisplayDataEvaluator evaluator = DisplayDataEvaluator.create(); TextIO.Read read = TextIO.read() .from("foobar"); Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read); assertThat("TextIO.Read should include the file prefix in its primitive display data", displayData, hasItem(hasDisplayItem(hasValue(startsWith("foobar"))))); } private void runTestWrite(String[] elems) throws Exception { runTestWrite(elems, null, null, 1); } private void runTestWrite(String[] elems, int numShards) throws Exception { runTestWrite(elems, null, null, numShards); } private void runTestWrite(String[] elems, String header, String footer) throws Exception { runTestWrite(elems, header, footer, 1); } private void runTestWrite( String[] elems, String header, String footer, int numShards) throws Exception { String outputName = "file.txt"; Path baseDir = Files.createTempDirectory(tempFolder, "testwrite"); String baseFilename = baseDir.resolve(outputName).toString(); PCollection<String> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(StringUtf8Coder.of())); TextIO.Write write = TextIO.write().to(baseFilename) .withHeader(header) .withFooter(footer); if (numShards == 1) { write = write.withoutSharding(); } else if (numShards > 0) { write = write.withNumShards(numShards).withShardNameTemplate(ShardNameTemplate.INDEX_OF_MAX); } input.apply(write); p.run(); assertOutputFiles(elems, header, footer, numShards, baseDir, outputName, firstNonNull(write.getShardTemplate(), DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE)); } public static void assertOutputFiles( String[] elems, final String header, final String footer, int numShards, Path rootLocation, String outputName, String shardNameTemplate) throws Exception { List<File> expectedFiles = new ArrayList<>(); if (numShards == 0) { String pattern = rootLocation.toAbsolutePath().resolve(outputName + "*").toString(); List<MatchResult> matches = FileSystems.match(Collections.singletonList(pattern)); for (Metadata expectedFile : Iterables.getOnlyElement(matches).metadata()) { expectedFiles.add(new File(expectedFile.resourceId().toString())); } } else { for (int i = 0; i < numShards; i++) { expectedFiles.add( new File( rootLocation.toString(), DefaultFilenamePolicy.constructName( outputName, shardNameTemplate, "", i, numShards))); } } List<List<String>> actual = new ArrayList<>(); for (File tmpFile : expectedFiles) { try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) { List<String> currentFile = new ArrayList<>(); for (;;) { String line = reader.readLine(); if (line == null) { break; } currentFile.add(line); } actual.add(currentFile); } } List<String> expectedElements = new ArrayList<>(elems.length); for (String elem : elems) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); String line = new String(encodedElem); expectedElements.add(line); } List<String> actualElements = Lists.newArrayList( Iterables.concat( FluentIterable .from(actual) .transform(removeHeaderAndFooter(header, footer)) .toList())); assertThat(actualElements, containsInAnyOrder(expectedElements.toArray())); assertTrue(Iterables.all(actual, haveProperHeaderAndFooter(header, footer))); } private static Function<List<String>, List<String>> removeHeaderAndFooter(final String header, final String footer) { return new Function<List<String>, List<String>>() { @Nullable @Override public List<String> apply(List<String> lines) { ArrayList<String> newLines = Lists.newArrayList(lines); if (header != null) { newLines.remove(0); } if (footer != null) { int last = newLines.size() - 1; newLines.remove(last); } return newLines; } }; } private static Predicate<List<String>> haveProperHeaderAndFooter(final String header, final String footer) { return new Predicate<List<String>>() { @Override public boolean apply(List<String> fileLines) { int last = fileLines.size() - 1; return (header == null || fileLines.get(0).equals(header)) && (footer == null || fileLines.get(last).equals(footer)); } }; } @Test @Category(NeedsRunner.class) public void testWriteStrings() throws Exception { runTestWrite(LINES_ARRAY); } @Test @Category(NeedsRunner.class) public void testWriteEmptyStringsNoSharding() throws Exception { runTestWrite(NO_LINES_ARRAY, 0); } @Test @Category(NeedsRunner.class) public void testWriteEmptyStrings() throws Exception { runTestWrite(NO_LINES_ARRAY); } @Test @Category(NeedsRunner.class) public void testShardedWrite() throws Exception { runTestWrite(LINES_ARRAY, 5); } @Test @Category(NeedsRunner.class) public void testWriteWithHeader() throws Exception { runTestWrite(LINES_ARRAY, MY_HEADER, null); } @Test @Category(NeedsRunner.class) public void testWriteWithFooter() throws Exception { runTestWrite(LINES_ARRAY, null, MY_FOOTER); } @Test @Category(NeedsRunner.class) public void testWriteWithHeaderAndFooter() throws Exception { runTestWrite(LINES_ARRAY, MY_HEADER, MY_FOOTER); } @Test @Category(NeedsRunner.class) public void testWriteWithWritableByteChannelFactory() throws Exception { Coder<String> coder = StringUtf8Coder.of(); String outputName = "file.txt"; Path baseDir = Files.createTempDirectory(tempFolder, "testwrite"); PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES2_ARRAY)).withCoder(coder)); final WritableByteChannelFactory writableByteChannelFactory = new DrunkWritableByteChannelFactory(); TextIO.Write write = TextIO.write().to(baseDir.resolve(outputName).toString()) .withoutSharding().withWritableByteChannelFactory(writableByteChannelFactory); DisplayData displayData = DisplayData.from(write); assertThat(displayData, hasDisplayItem("writableByteChannelFactory", "DRUNK")); input.apply(write); p.run(); final List<String> drunkElems = new ArrayList<>(LINES2_ARRAY.length * 2 + 2); for (String elem : LINES2_ARRAY) { drunkElems.add(elem); drunkElems.add(elem); } assertOutputFiles(drunkElems.toArray(new String[0]), null, null, 1, baseDir, outputName + writableByteChannelFactory.getFilenameSuffix(), write.getShardTemplate()); } @Test public void testWriteDisplayData() { TextIO.Write write = TextIO.write() .to("/foo") .withSuffix("bar") .withShardNameTemplate("-SS-of-NN-") .withNumShards(100) .withFooter("myFooter") .withHeader("myHeader"); DisplayData displayData = DisplayData.from(write); assertThat(displayData, hasDisplayItem("filePrefix", "/foo")); assertThat(displayData, hasDisplayItem("fileSuffix", "bar")); assertThat(displayData, hasDisplayItem("fileHeader", "myHeader")); assertThat(displayData, hasDisplayItem("fileFooter", "myFooter")); assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-")); assertThat(displayData, hasDisplayItem("numShards", 100)); assertThat(displayData, hasDisplayItem("writableByteChannelFactory", "UNCOMPRESSED")); } @Test public void testWriteDisplayDataValidateThenHeader() { TextIO.Write write = TextIO.write() .to("foo") .withHeader("myHeader"); DisplayData displayData = DisplayData.from(write); assertThat(displayData, hasDisplayItem("fileHeader", "myHeader")); } @Test public void testWriteDisplayDataValidateThenFooter() { TextIO.Write write = TextIO.write() .to("foo") .withFooter("myFooter"); DisplayData displayData = DisplayData.from(write); assertThat(displayData, hasDisplayItem("fileFooter", "myFooter")); } /** Options for testing. */ public interface RuntimeTestOptions extends PipelineOptions { ValueProvider<String> getInput(); void setInput(ValueProvider<String> value); ValueProvider<String> getOutput(); void setOutput(ValueProvider<String> value); } @Test public void testRuntimeOptionsNotCalledInApply() throws Exception { p.enableAbandonedNodeEnforcement(false); RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class); p .apply(TextIO.read().from(options.getInput())) .apply(TextIO.write().to(options.getOutput())); } @Test public void testCompressionTypeIsSet() throws Exception { TextIO.Read read = TextIO.read().from("/tmp/test"); assertEquals(AUTO, read.getCompressionType()); read = TextIO.read().from("/tmp/test").withCompressionType(GZIP); assertEquals(GZIP, read.getCompressionType()); } /** * Helper that writes the given lines (adding a newline in between) to a stream, then closes the * stream. */ private static void writeToStreamAndClose(String[] lines, OutputStream outputStream) { try (PrintStream writer = new PrintStream(outputStream)) { for (String line : lines) { writer.println(line); } } } /** * Helper method that runs TextIO.read().from(filename).withCompressionType(compressionType) * and asserts that the results match the given expected output. */ private void assertReadingCompressedFileMatchesExpected( File file, CompressionType compressionType, String[] expected) { TextIO.Read read = TextIO.read().from(file.getPath()).withCompressionType(compressionType); PCollection<String> output = p.apply("Read_" + file + "_" + compressionType.toString(), read); PAssert.that(output).containsInAnyOrder(expected); p.run(); } /** * Helper to make an array of compressible strings. Returns ["word"i] for i in range(0,n). */ private static String[] makeLines(int n) { String[] ret = new String[n]; for (int i = 0; i < n; ++i) { ret[i] = "word" + i; } return ret; } /** * Tests reading from a small, gzipped file with no .gz extension but GZIP compression set. */ @Test @Category(NeedsRunner.class) public void testSmallCompressedGzipReadNoExtension() throws Exception { File smallGzNoExtension = writeToFile(TINY, "tiny_gz_no_extension", GZIP); assertReadingCompressedFileMatchesExpected(smallGzNoExtension, GZIP, TINY); } /** * Tests reading from a small, uncompressed file with .gz extension. This must work in AUTO or * GZIP modes. This is needed because some network file systems / HTTP clients will transparently * decompress gzipped content. */ @Test @Category(NeedsRunner.class) public void testSmallCompressedGzipReadActuallyUncompressed() throws Exception { File smallGzNotCompressed = writeToFile(TINY, "tiny_uncompressed.gz", CompressionType.UNCOMPRESSED); // Should work with GZIP compression set. assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, GZIP, TINY); // Should also work with AUTO mode set. assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, AUTO, TINY); } /** * Tests reading from a small, bzip2ed file with no .bz2 extension but BZIP2 compression set. */ @Test @Category(NeedsRunner.class) public void testSmallCompressedBzip2ReadNoExtension() throws Exception { File smallBz2NoExtension = writeToFile(TINY, "tiny_bz2_no_extension", BZIP2); assertReadingCompressedFileMatchesExpected(smallBz2NoExtension, BZIP2, TINY); } /** * Create a zip file with the given lines. * * @param expected A list of expected lines, populated in the zip file. * @param filename Optionally zip file name (can be null). * @param fieldsEntries Fields to write in zip entries. * @return The zip filename. * @throws Exception In case of a failure during zip file creation. */ private String createZipFile(List<String> expected, String filename, String[]... fieldsEntries) throws Exception { File tmpFile = tempFolder.resolve(filename).toFile(); String tmpFileName = tmpFile.getPath(); ZipOutputStream out = new ZipOutputStream(new FileOutputStream(tmpFile)); PrintStream writer = new PrintStream(out, true /* auto-flush on write */); int index = 0; for (String[] entry : fieldsEntries) { out.putNextEntry(new ZipEntry(Integer.toString(index))); for (String field : entry) { writer.println(field); expected.add(field); } out.closeEntry(); index++; } writer.close(); out.close(); return tmpFileName; } @Test @Category(NeedsRunner.class) public void testTxtRead() throws Exception { // Files with non-compressed extensions should work in AUTO and UNCOMPRESSED modes. for (CompressionType type : new CompressionType[]{AUTO, UNCOMPRESSED}) { assertReadingCompressedFileMatchesExpected(emptyTxt, type, EMPTY); assertReadingCompressedFileMatchesExpected(tinyTxt, type, TINY); assertReadingCompressedFileMatchesExpected(largeTxt, type, LARGE); } } @Test @Category(NeedsRunner.class) public void testGzipCompressedRead() throws Exception { // Files with the right extensions should work in AUTO and GZIP modes. for (CompressionType type : new CompressionType[]{AUTO, GZIP}) { assertReadingCompressedFileMatchesExpected(emptyGz, type, EMPTY); assertReadingCompressedFileMatchesExpected(tinyGz, type, TINY); assertReadingCompressedFileMatchesExpected(largeGz, type, LARGE); } // Sanity check that we're properly testing compression. assertThat(largeTxt.length(), greaterThan(largeGz.length())); // GZIP files with non-gz extension should work in GZIP mode. File gzFile = writeToFile(TINY, "tiny_gz_no_extension", GZIP); assertReadingCompressedFileMatchesExpected(gzFile, GZIP, TINY); } @Test @Category(NeedsRunner.class) public void testBzip2CompressedRead() throws Exception { // Files with the right extensions should work in AUTO and BZIP2 modes. for (CompressionType type : new CompressionType[]{AUTO, BZIP2}) { assertReadingCompressedFileMatchesExpected(emptyBzip2, type, EMPTY); assertReadingCompressedFileMatchesExpected(tinyBzip2, type, TINY); assertReadingCompressedFileMatchesExpected(largeBzip2, type, LARGE); } // Sanity check that we're properly testing compression. assertThat(largeTxt.length(), greaterThan(largeBzip2.length())); // BZ2 files with non-bz2 extension should work in BZIP2 mode. File bz2File = writeToFile(TINY, "tiny_bz2_no_extension", BZIP2); assertReadingCompressedFileMatchesExpected(bz2File, BZIP2, TINY); } @Test @Category(NeedsRunner.class) public void testZipCompressedRead() throws Exception { // Files with the right extensions should work in AUTO and ZIP modes. for (CompressionType type : new CompressionType[]{AUTO, ZIP}) { assertReadingCompressedFileMatchesExpected(emptyZip, type, EMPTY); assertReadingCompressedFileMatchesExpected(tinyZip, type, TINY); assertReadingCompressedFileMatchesExpected(largeZip, type, LARGE); } // Sanity check that we're properly testing compression. assertThat(largeTxt.length(), greaterThan(largeZip.length())); // Zip files with non-zip extension should work in ZIP mode. File zipFile = writeToFile(TINY, "tiny_zip_no_extension", ZIP); assertReadingCompressedFileMatchesExpected(zipFile, ZIP, TINY); } @Test @Category(NeedsRunner.class) public void testDeflateCompressedRead() throws Exception { // Files with the right extensions should work in AUTO and ZIP modes. for (CompressionType type : new CompressionType[]{AUTO, DEFLATE}) { assertReadingCompressedFileMatchesExpected(emptyDeflate, type, EMPTY); assertReadingCompressedFileMatchesExpected(tinyDeflate, type, TINY); assertReadingCompressedFileMatchesExpected(largeDeflate, type, LARGE); } // Sanity check that we're properly testing compression. assertThat(largeTxt.length(), greaterThan(largeDeflate.length())); // Deflate files with non-deflate extension should work in DEFLATE mode. File deflateFile = writeToFile(TINY, "tiny_deflate_no_extension", DEFLATE); assertReadingCompressedFileMatchesExpected(deflateFile, DEFLATE, TINY); } /** * Tests a zip file with no entries. This is a corner case not tested elsewhere as the default * test zip files have a single entry. */ @Test @Category(NeedsRunner.class) public void testZipCompressedReadWithNoEntries() throws Exception { String filename = createZipFile(new ArrayList<String>(), "empty zip file"); assertReadingCompressedFileMatchesExpected(new File(filename), CompressionType.ZIP, EMPTY); } /** * Tests a zip file with multiple entries. This is a corner case not tested elsewhere as the * default test zip files have a single entry. */ @Test @Category(NeedsRunner.class) public void testZipCompressedReadWithMultiEntriesFile() throws Exception { String[] entry0 = new String[]{"first", "second", "three"}; String[] entry1 = new String[]{"four", "five", "six"}; String[] entry2 = new String[]{"seven", "eight", "nine"}; List<String> expected = new ArrayList<>(); String filename = createZipFile(expected, "multiple entries", entry0, entry1, entry2); assertReadingCompressedFileMatchesExpected( new File(filename), CompressionType.ZIP, expected.toArray(new String[]{})); } /** * Read a ZIP compressed file containing data, multiple empty entries, and then more data. We * expect just the data back. */ @Test @Category(NeedsRunner.class) public void testZipCompressedReadWithComplexEmptyAndPresentEntries() throws Exception { String filename = createZipFile( new ArrayList<String>(), "complex empty and present entries", new String[]{"cat"}, new String[]{}, new String[]{}, new String[]{"dog"}); assertReadingCompressedFileMatchesExpected( new File(filename), CompressionType.ZIP, new String[] {"cat", "dog"}); } @Test public void testTextIOGetName() { assertEquals("TextIO.Read", TextIO.read().from("somefile").getName()); assertEquals("TextIO.Write", TextIO.write().to("somefile").getName()); assertEquals("TextIO.Read", TextIO.read().from("somefile").toString()); } @Test public void testProgressEmptyFile() throws IOException { try (BoundedReader<String> reader = prepareSource(new byte[0]).createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting. assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Assert empty assertFalse(reader.start()); // Check postconditions after finishing assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } } @Test public void testProgressTextFile() throws IOException { String file = "line1\nline2\nline3"; try (BoundedReader<String> reader = prepareSource(file.getBytes()).createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Line 1 assertTrue(reader.start()); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Line 2 assertTrue(reader.advance()); assertEquals(1, reader.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Line 3 assertTrue(reader.advance()); assertEquals(2, reader.getSplitPointsConsumed()); assertEquals(1, reader.getSplitPointsRemaining()); // Check postconditions after finishing assertFalse(reader.advance()); assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(3, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } } @Test public void testProgressAfterSplitting() throws IOException { String file = "line1\nline2\nline3"; BoundedSource<String> source = prepareSource(file.getBytes()); BoundedSource<String> remainder; // Create the remainder, verifying properties pre- and post-splitting. try (BoundedReader<String> readerOrig = source.createReader(PipelineOptionsFactory.create())) { // Preconditions. assertEquals(0.0, readerOrig.getFractionConsumed(), 1e-6); assertEquals(0, readerOrig.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining()); // First record, before splitting. assertTrue(readerOrig.start()); assertEquals(0, readerOrig.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining()); // Split. 0.1 is in line1, so should now be able to detect last record. remainder = readerOrig.splitAtFraction(0.1); System.err.println(readerOrig.getCurrentSource()); assertNotNull(remainder); // First record, after splitting. assertEquals(0, readerOrig.getSplitPointsConsumed()); assertEquals(1, readerOrig.getSplitPointsRemaining()); // Finish and postconditions. assertFalse(readerOrig.advance()); assertEquals(1.0, readerOrig.getFractionConsumed(), 1e-6); assertEquals(1, readerOrig.getSplitPointsConsumed()); assertEquals(0, readerOrig.getSplitPointsRemaining()); } // Check the properties of the remainder. try (BoundedReader<String> reader = remainder.createReader(PipelineOptionsFactory.create())) { // Preconditions. assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // First record should be line 2. assertTrue(reader.start()); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Second record is line 3 assertTrue(reader.advance()); assertEquals(1, reader.getSplitPointsConsumed()); assertEquals(1, reader.getSplitPointsRemaining()); // Check postconditions after finishing assertFalse(reader.advance()); assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(2, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } } @Test public void testReadEmptyLines() throws Exception { runTestReadWithData("\n\n\n".getBytes(StandardCharsets.UTF_8), ImmutableList.of("", "", "")); } @Test public void testReadFileWithLineFeedDelimiter() throws Exception { runTestReadWithData("asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithCarriageReturnDelimiter() throws Exception { runTestReadWithData("asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithCarriageReturnAndLineFeedDelimiter() throws Exception { runTestReadWithData("asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithMixedDelimiters() throws Exception { runTestReadWithData("asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception { runTestReadWithData("asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd() throws Exception { runTestReadWithData("asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception { runTestReadWithData("asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } @Test public void testReadFileWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception { runTestReadWithData("asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8), ImmutableList.of("asdf", "hjkl", "xyz")); } private void runTestReadWithData(byte[] data, List<String> expectedResults) throws Exception { TextSource source = prepareSource(data); List<String> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create()); assertThat(actual, containsInAnyOrder(new ArrayList<>(expectedResults).toArray(new String[0]))); } @Test public void testSplittingSourceWithEmptyLines() throws Exception { TextSource source = prepareSource("\n\n\n".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithLineFeedDelimiter() throws Exception { TextSource source = prepareSource("asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithCarriageReturnDelimiter() throws Exception { TextSource source = prepareSource("asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiter() throws Exception { TextSource source = prepareSource( "asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithMixedDelimiters() throws Exception { TextSource source = prepareSource( "asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception { TextSource source = prepareSource("asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd() throws Exception { TextSource source = prepareSource("asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception { TextSource source = prepareSource( "asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } @Test public void testSplittingSourceWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception { TextSource source = prepareSource("asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } private TextSource prepareSource(byte[] data) throws IOException { Path path = Files.createTempFile(tempFolder, "tempfile", "ext"); Files.write(path, data); return new TextSource(ValueProvider.StaticValueProvider.of(path.toString())); } @Test public void testInitialSplitAutoModeTxt() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; // Sanity check: file is at least 2 bundles long. assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeTxt.getPath()).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // At least 2 splits and they are equal to reading the whole file. assertThat(splits, hasSize(greaterThan(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test public void testInitialSplitAutoModeGz() throws Exception { long desiredBundleSize = 1000; PipelineOptions options = TestPipeline.testingPipelineOptions(); // Sanity check: file is at least 2 bundles long. assertThat(largeGz.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeGz.getPath()).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split, even in AUTO mode, since it is a gzip file. assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test public void testInitialSplitGzipModeTxt() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; // Sanity check: file is at least 2 bundles long. assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeTxt.getPath()).withCompressionType(GZIP).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split, even though splittable text file, since using GZIP mode. assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test public void testInitialSplitGzipModeGz() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; // Sanity check: file is at least 2 bundles long. assertThat(largeGz.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeGz.getPath()).withCompressionType(GZIP).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split using .gz extension and using GZIP mode. assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test public void testWindowedWriteRequiresFilenamePolicy() { PCollection<String> emptyInput = p.apply(Create.empty(StringUtf8Coder.of())); TextIO.Write write = TextIO.write().to("/tmp/some/file").withWindowedWrites(); expectedException.expect(IllegalStateException.class); expectedException.expectMessage( "When using windowed writes, a filename policy must be set via withFilenamePolicy()"); emptyInput.apply(write); } }