FileBasedSourceTest.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static org.apache.beam.sdk.testing.SourceTestUtils.assertSplitAtFractionExhaustive;
import static org.apache.beam.sdk.testing.SourceTestUtils.assertSplitAtFractionFails;
import static org.apache.beam.sdk.testing.SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent;
import static org.apache.beam.sdk.testing.SourceTestUtils.readFromSource;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.FileBasedSource.FileBasedReader;
import org.apache.beam.sdk.io.Source.Reader;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.testing.NeedsRunner;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests code common to all file-based sources.
 */
@RunWith(JUnit4.class)
public class FileBasedSourceTest {

  private Random random = new Random(0L);

  @Rule public final TestPipeline p = TestPipeline.create();
  @Rule public TemporaryFolder tempFolder = new TemporaryFolder();
  @Rule public ExpectedException thrown = ExpectedException.none();

  /**
   * If {@code splitHeader} is null, this is just a simple line-based reader. Otherwise, the file is
   * considered to consist of blocks beginning with {@code splitHeader}. The header itself is not
   * returned as a record. The first record after the header is considered to be a split point.
   *
   * <p>E.g., if {@code splitHeader} is "h" and the lines of the file are: h, a, b, h, h, c, then
   * the records in this source are a,b,c, and records a and c are split points.
   */
  static class TestFileBasedSource extends FileBasedSource<String> {

    final String splitHeader;

    public TestFileBasedSource(String fileOrPattern, long minBundleSize, String splitHeader) {
      super(StaticValueProvider.of(fileOrPattern), minBundleSize);
      this.splitHeader = splitHeader;
    }

    public TestFileBasedSource(
        Metadata fileOrPattern,
        long minBundleSize,
        long startOffset,
        long endOffset,
        @Nullable String splitHeader) {
      super(fileOrPattern, minBundleSize, startOffset, endOffset);
      this.splitHeader = splitHeader;
    }

    @Override
    public void validate() {}

    @Override
    public Coder<String> getDefaultOutputCoder() {
      return StringUtf8Coder.of();
    }

    @Override
    protected FileBasedSource<String> createForSubrangeOfFile(
        Metadata fileName, long start, long end) {
      return new TestFileBasedSource(fileName, getMinBundleSize(), start, end, splitHeader);
    }

    @Override
    protected FileBasedReader<String> createSingleFileReader(PipelineOptions options) {
      if (splitHeader == null) {
        return new TestReader(this);
      } else {
        return new TestReaderWithSplits(this);
      }
    }
  }

  /**
   * A utility class that starts reading lines from a given offset in a file until EOF.
   */
  private static class LineReader {
    private ReadableByteChannel channel = null;
    private long nextLineStart = 0;
    private long currentLineStart = 0;
    private final ByteBuffer buf;
    private static final int BUF_SIZE = 1024;
    private String currentValue = null;

    public LineReader(ReadableByteChannel channel) throws IOException {
      buf = ByteBuffer.allocate(BUF_SIZE);
      buf.flip();

      boolean removeLine = false;
      // If we are not at the beginning of a line, we should ignore the current line.
      if (channel instanceof SeekableByteChannel) {
        SeekableByteChannel seekChannel = (SeekableByteChannel) channel;
        if (seekChannel.position() > 0) {
          // Start from one character back and read till we find a new line.
          seekChannel.position(seekChannel.position() - 1);
          removeLine = true;
        }
        nextLineStart = seekChannel.position();
      }
      this.channel = channel;
      if (removeLine) {
        nextLineStart += readNextLine(new ByteArrayOutputStream());
      }
    }

    private int readNextLine(ByteArrayOutputStream out) throws IOException {
      int byteCount = 0;
      while (true) {
        if (!buf.hasRemaining()) {
          buf.clear();
          int read = channel.read(buf);
          if (read < 0) {
            break;
          }
          buf.flip();
        }
        byte b = buf.get();
        byteCount++;
        if (b == '\n') {
          break;
        }
        out.write(b);
      }
      return byteCount;
    }

    public boolean readNextLine() throws IOException {
      currentLineStart = nextLineStart;

      ByteArrayOutputStream buf = new ByteArrayOutputStream();
      int offsetAdjustment = readNextLine(buf);
      if (offsetAdjustment == 0) {
        // EOF
        return false;
      }
      nextLineStart += offsetAdjustment;
      // When running on Windows, each line obtained from 'readNextLine()' will end with a '\r'
      // since we use '\n' as the line boundary of the reader. So we trim it off here.
      currentValue = CoderUtils.decodeFromByteArray(StringUtf8Coder.of(), buf.toByteArray()).trim();
      return true;
    }

    public String getCurrent() {
      return currentValue;
    }

    public long getCurrentLineStart() {
      return currentLineStart;
    }
  }

  /**
   * A reader that can read lines of text from a {@link TestFileBasedSource}. This reader does not
   * consider {@code splitHeader} defined by {@code TestFileBasedSource} hence every line can be the
   * first line of a split.
   */
  private static class TestReader extends FileBasedReader<String> {
    private LineReader lineReader = null;

    public TestReader(TestFileBasedSource source) {
      super(source);
    }

    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      this.lineReader = new LineReader(channel);
    }

    @Override
    protected boolean readNextRecord() throws IOException {
      return lineReader.readNextLine();
    }

    @Override
    protected boolean isAtSplitPoint() {
      return true;
    }

    @Override
    protected long getCurrentOffset() {
      return lineReader.getCurrentLineStart();
    }

    @Override
    public String getCurrent() throws NoSuchElementException {
      return lineReader.getCurrent();
    }
  }

  /**
   * A reader that can read lines of text from a {@link TestFileBasedSource}. This reader considers
   * {@code splitHeader} defined by {@code TestFileBasedSource} hence only lines that immediately
   * follow a {@code splitHeader} are split points.
   */
  private static class TestReaderWithSplits extends FileBasedReader<String> {
    private LineReader lineReader;
    private final String splitHeader;
    private boolean foundFirstSplitPoint = false;
    private boolean isAtSplitPoint = false;
    private long currentOffset;

    public TestReaderWithSplits(TestFileBasedSource source) {
      super(source);
      this.splitHeader = source.splitHeader;
    }

    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      this.lineReader = new LineReader(channel);
    }

    @Override
    protected boolean readNextRecord() throws IOException {
      if (!foundFirstSplitPoint) {
        while (!isAtSplitPoint) {
          if (!readNextRecordInternal()) {
            return false;
          }
        }
        foundFirstSplitPoint = true;
        return true;
      }
      return readNextRecordInternal();
    }

    private boolean readNextRecordInternal() throws IOException {
      isAtSplitPoint = false;
      if (!lineReader.readNextLine()) {
        return false;
      }
      currentOffset = lineReader.getCurrentLineStart();
      while (getCurrent().equals(splitHeader)) {
        currentOffset = lineReader.getCurrentLineStart();
        if (!lineReader.readNextLine()) {
          return false;
        }
        isAtSplitPoint = true;
      }
      return true;
    }

    @Override
    protected boolean isAtSplitPoint() {
      return isAtSplitPoint;
    }

    @Override
    protected long getCurrentOffset() {
      return currentOffset;
    }

    @Override
    public String getCurrent() throws NoSuchElementException {
      return lineReader.getCurrent();
    }
  }

  public File createFileWithData(String fileName, List<String> data) throws IOException {
    File file = tempFolder.newFile(fileName);
    Files.write(file.toPath(), data, StandardCharsets.UTF_8);
    return file;
  }

  private String createRandomString(int length) {
    char[] chars = "abcdefghijklmnopqrstuvwxyz".toCharArray();
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < length; i++) {
      builder.append(chars[random.nextInt(chars.length)]);
    }
    return builder.toString();
  }

  public List<String> createStringDataset(int dataItemLength, int numItems) {
    List<String> list = new ArrayList<String>();
    for (int i = 0; i < numItems; i++) {
      list.add(createRandomString(dataItemLength));
    }
    return list;
  }

  @Test
  public void testFullyReadSingleFile() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data = createStringDataset(3, 50);

    String fileName = "file";
    File file = createFileWithData(fileName, data);

    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, null);
    assertEquals(data, readFromSource(source, options));
  }

  @Test
  public void testFullyReadFilePattern() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data1 = createStringDataset(3, 50);
    File file1 = createFileWithData("file1", data1);

    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);

    List<String> data4 = createStringDataset(3, 50);
    createFileWithData("otherfile", data4);

    TestFileBasedSource source =
        new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);
    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data1);
    expectedResults.addAll(data2);
    expectedResults.addAll(data3);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
  }

  @Test
  public void testCloseUnstartedFilePatternReader() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data1 = createStringDataset(3, 50);
    File file1 = createFileWithData("file1", data1);

    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);

    List<String> data4 = createStringDataset(3, 50);
    createFileWithData("otherfile", data4);

    TestFileBasedSource source =
        new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);
    Reader<String> reader = source.createReader(options);
    // Closing an unstarted FilePatternReader should not throw an exception.
    try {
      reader.close();
    } catch (Exception e) {
      fail("Closing an unstarted FilePatternReader should not throw an exception");
    }
  }

  @Test
  public void testSplittingFailsOnEmptyFileExpansion() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    String missingFilePath = tempFolder.newFolder().getAbsolutePath() + "/missing.txt";
    TestFileBasedSource source = new TestFileBasedSource(missingFilePath, Long.MAX_VALUE, null);
    thrown.expect(FileNotFoundException.class);
    thrown.expectMessage(String.format("No files found for spec: %s", missingFilePath));
    source.split(1234, options);
  }

  @Test
  public void testFractionConsumedWhenReadingFilepattern() throws IOException {
    List<String> data1 = createStringDataset(3, 1000);
    File file1 = createFileWithData("file1", data1);

    List<String> data2 = createStringDataset(3, 1000);
    createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 1000);
    createFileWithData("file3", data3);

    TestFileBasedSource source =
        new TestFileBasedSource(file1.getParent() + "/" + "file*", 1024, null);
    try (BoundedSource.BoundedReader<String> reader = source.createReader(null)) {
      double lastFractionConsumed = 0.0;
      assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
      assertTrue(reader.start());
      assertTrue(reader.advance());
      assertTrue(reader.advance());
      // We're inside the first file. Should be in [0, 1/3).
      assertTrue(reader.getFractionConsumed() > 0.0);
      assertTrue(reader.getFractionConsumed() < 1.0 / 3.0);
      while (reader.advance()) {
        double fractionConsumed = reader.getFractionConsumed();
        assertTrue(fractionConsumed > lastFractionConsumed);
        lastFractionConsumed = fractionConsumed;
      }
      assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
    }
  }

  @Test
  public void testFullyReadFilePatternFirstRecordEmpty() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    File file1 = createFileWithData("file1", new ArrayList<String>());

    String pattern = file1.getParent() + "/file*";

    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);

    List<String> data4 = createStringDataset(3, 50);
    createFileWithData("otherfile", data4);

    TestFileBasedSource source = new TestFileBasedSource(pattern, 64, null);

    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data2);
    expectedResults.addAll(data3);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
  }

  @Test
  public void testReadRangeAtStart() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data = createStringDataset(3, 50);

    String fileName = "file";
    File file = createFileWithData(fileName, data);

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 25, null);
    TestFileBasedSource source2 =
        new TestFileBasedSource(metadata, 64, 25, Long.MAX_VALUE, null);

    List<String> results = new ArrayList<String>();
    results.addAll(readFromSource(source1, options));
    results.addAll(readFromSource(source2, options));
    assertThat(data, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testReadEverythingFromFileWithSplits() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";
    List<String> data = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      data.add(header);
      data.addAll(createStringDataset(3, 9));
    }
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, header);

    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data);
    // Remove all occurrences of header from expected results.
    expectedResults.removeAll(Collections.singletonList(header));

    assertEquals(expectedResults, readFromSource(source, options));
  }

  @Test
  public void testReadRangeFromFileWithSplitsFromStart() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";
    List<String> data = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      data.add(header);
      data.addAll(createStringDataset(3, 9));
    }
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 60, header);
    TestFileBasedSource source2 =
        new TestFileBasedSource(metadata, 64, 60, Long.MAX_VALUE, header);

    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data);
    // Remove all occurrences of header from expected results.
    expectedResults.removeAll(Arrays.asList(header));

    List<String> results = new ArrayList<>();
    results.addAll(readFromSource(source1, options));
    results.addAll(readFromSource(source2, options));

    assertThat(expectedResults, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testReadRangeFromFileWithSplitsFromMiddle() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";
    List<String> data = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      data.add(header);
      data.addAll(createStringDataset(3, 9));
    }
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 42, header);
    TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 42, 112, header);
    TestFileBasedSource source3 =
        new TestFileBasedSource(metadata, 64, 112, Long.MAX_VALUE, header);

    List<String> expectedResults = new ArrayList<String>();

    expectedResults.addAll(data);
    // Remove all occurrences of header from expected results.
    expectedResults.removeAll(Collections.singletonList(header));

    List<String> results = new ArrayList<>();
    results.addAll(readFromSource(source1, options));
    results.addAll(readFromSource(source2, options));
    results.addAll(readFromSource(source3, options));

    assertThat(expectedResults, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testReadFileWithSplitsWithEmptyRange() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";
    List<String> data = new ArrayList<>();
    for (int i = 0; i < 5; i++) {
      data.add(header);
      data.addAll(createStringDataset(3, 9));
    }
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 42, header);
    TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 42, 62, header);
    TestFileBasedSource source3 =
        new TestFileBasedSource(metadata, 64, 62, Long.MAX_VALUE, header);

    List<String> expectedResults = new ArrayList<String>();

    expectedResults.addAll(data);
    // Remove all occurrences of header from expected results.
    expectedResults.removeAll(Collections.singletonList(header));

    List<String> results = new ArrayList<>();
    results.addAll(readFromSource(source1, options));
    results.addAll(readFromSource(source2, options));
    results.addAll(readFromSource(source3, options));

    assertThat(expectedResults, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    String header = "<h>";
    List<String> data = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      data.add(header);
      data.addAll(createStringDataset(3, 9));
    }
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data.subList(10, data.size()));
    // Remove all occurrences of header from expected results.
    expectedResults.removeAll(Collections.singletonList(header));

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    // Split starts after "<" of the header
    TestFileBasedSource source =
        new TestFileBasedSource(metadata, 64, 1, Long.MAX_VALUE, header);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));

    // Split starts after "<h" of the header
    source = new TestFileBasedSource(metadata, 64, 2, Long.MAX_VALUE, header);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));

    // Split starts after "<h>" of the header
    source = new TestFileBasedSource(metadata, 64, 3, Long.MAX_VALUE, header);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
  }

  @Test
  public void testReadRangeAtMiddle() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data = createStringDataset(3, 50);
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 52, null);
    TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 52, 72, null);
    TestFileBasedSource source3 =
        new TestFileBasedSource(metadata, 64, 72, Long.MAX_VALUE, null);

    List<String> results = new ArrayList<>();
    results.addAll(readFromSource(source1, options));
    results.addAll(readFromSource(source2, options));
    results.addAll(readFromSource(source3, options));

    assertThat(data, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testReadRangeAtEnd() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data = createStringDataset(3, 50);

    String fileName = "file";
    File file = createFileWithData(fileName, data);

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 162, null);
    TestFileBasedSource source2 =
        new TestFileBasedSource(metadata, 1024, 162, Long.MAX_VALUE, null);

    List<String> results = new ArrayList<>();
    results.addAll(readFromSource(source1, options));
    results.addAll(readFromSource(source2, options));

    assertThat(data, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testReadAllSplitsOfSingleFile() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data = createStringDataset(3, 50);

    String fileName = "file";
    File file = createFileWithData(fileName, data);

    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 16, null);

    List<? extends BoundedSource<String>> sources = source.split(32, null);

    // Not a trivial split.
    assertTrue(sources.size() > 1);

    List<String> results = new ArrayList<String>();
    for (BoundedSource<String> split : sources) {
      results.addAll(readFromSource(split, options));
    }

    assertThat(data, containsInAnyOrder(results.toArray()));
  }

  @Test
  @Category(NeedsRunner.class)
  public void testDataflowFile() throws IOException {
    List<String> data = createStringDataset(3, 50);

    String fileName = "file";
    File file = createFileWithData(fileName, data);

    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, null);
    PCollection<String> output = p.apply("ReadFileData", Read.from(source));

    PAssert.that(output).containsInAnyOrder(data);
    p.run();
  }

  @Test
  @Category(NeedsRunner.class)
  public void testDataflowFilePattern() throws IOException {

    List<String> data1 = createStringDataset(3, 50);
    File file1 = createFileWithData("file1", data1);

    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);

    List<String> data4 = createStringDataset(3, 50);
    createFileWithData("otherfile", data4);

    TestFileBasedSource source =
        new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);

    PCollection<String> output = p.apply("ReadFileData", Read.from(source));

    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data1);
    expectedResults.addAll(data2);
    expectedResults.addAll(data3);

    PAssert.that(output).containsInAnyOrder(expectedResults);
    p.run();
  }

  @Test
  public void testEstimatedSizeOfFile() throws Exception {
    List<String> data = createStringDataset(3, 50);
    String fileName = "file";
    File file = createFileWithData(fileName, data);

    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, null);
    assertEquals(file.length(), source.getEstimatedSizeBytes(null));
  }

  @Test
  public void testEstimatedSizeOfFilePattern() throws Exception {
    List<String> data1 = createStringDataset(3, 20);
    File file1 = createFileWithData("file1", data1);

    List<String> data2 = createStringDataset(3, 40);
    File file2 = createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 30);
    File file3 = createFileWithData("file3", data3);

    List<String> data4 = createStringDataset(3, 45);
    createFileWithData("otherfile", data4);

    List<String> data5 = createStringDataset(3, 53);
    createFileWithData("anotherfile", data5);

    TestFileBasedSource source =
        new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);

    // Estimated size of the file pattern based source should be the total size of files that the
    // corresponding pattern is expanded into.
    assertEquals(
        file1.length() + file2.length() + file3.length(), source.getEstimatedSizeBytes(null));
  }

  @Test
  public void testReadAllSplitsOfFilePattern() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    List<String> data1 = createStringDataset(3, 50);
    File file1 = createFileWithData("file1", data1);

    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);

    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);

    List<String> data4 = createStringDataset(3, 50);
    createFileWithData("otherfile", data4);

    TestFileBasedSource source =
        new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);
    List<? extends BoundedSource<String>> sources = source.split(512, null);

    // Not a trivial split.
    assertTrue(sources.size() > 1);

    List<String> results = new ArrayList<String>();
    for (BoundedSource<String> split : sources) {
      results.addAll(readFromSource(split, options));
    }

    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data1);
    expectedResults.addAll(data2);
    expectedResults.addAll(data3);

    assertThat(expectedResults, containsInAnyOrder(results.toArray()));
  }

  @Test
  public void testSplitAtFraction() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    File file = createFileWithData("file", createStringDataset(3, 100));

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null);
    // Shouldn't be able to split while unstarted.
    assertSplitAtFractionFails(source, 0, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(source, 30, 0.7, options);
    assertSplitAtFractionFails(source, 0, 0.0, options);
    assertSplitAtFractionFails(source, 70, 0.3, options);
    assertSplitAtFractionFails(source, 100, 1.0, options);
    assertSplitAtFractionFails(source, 100, 0.99, options);
    assertSplitAtFractionSucceedsAndConsistent(source, 100, 0.995, options);
  }

  @Test
  public void testSplitAtFractionExhaustive() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Smaller file for exhaustive testing.
    File file = createFileWithData("file", createStringDataset(3, 20));

    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null);
    assertSplitAtFractionExhaustive(source, options);
  }

  @Test
  public void testToStringFile() throws Exception {
    File f = createFileWithData("foo", Collections.<String>emptyList());
    Metadata metadata = FileSystems.matchSingleFileSpec(f.getPath());
    TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, 10, null);
    assertEquals(String.format("%s range [0, 10)", f.getAbsolutePath()), source.toString());
  }
}