// TextSource.java example
//
// Explorer
// beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.Preconditions.checkState;

import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.NoSuchElementException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;

/**
 * Implementation detail of {@link TextIO.Read}.
 *
 * <p>A {@link FileBasedSource} which can decode records delimited by newline characters.
 *
 * <p>This source splits the data into records using {@code UTF-8} {@code \n}, {@code \r}, or
 * {@code \r\n} as the delimiter. This source is not strict and supports decoding the last record
 * even if it is not delimited. Finally, no records are decoded if the stream is empty.
 *
 * <p>This source supports reading from any arbitrary byte position within the stream. If the
 * starting position is not {@code 0}, the reader seeks to one byte before the starting position
 * and skips bytes up to and including the first delimiter found from there; the byte following
 * that delimiter is the beginning of the first record to be decoded.
 */
@VisibleForTesting
class TextSource extends FileBasedSource<String> {
  /**
   * Creates a source for the given file specification.
   *
   * @param fileSpec provider of the file specification, passed through to
   *     {@link FileBasedSource}
   */
  TextSource(ValueProvider<String> fileSpec) {
    // NOTE(review): 1L is presumably the minimum bundle size -- confirm against FileBasedSource.
    super(fileSpec, 1L);
  }

  /**
   * Creates a source restricted to the byte range {@code [start, end)} of a single file.
   *
   * @param metadata metadata of the file to read
   * @param start starting offset of the range
   * @param end ending offset of the range
   */
  private TextSource(MatchResult.Metadata metadata, long start, long end) {
    super(metadata, 1L, start, end);
  }

  /** Returns a new {@link TextSource} covering the given sub-range of the given file. */
  @Override
  protected FileBasedSource<String> createForSubrangeOfFile(
      MatchResult.Metadata metadata,
      long start,
      long end) {
    return new TextSource(metadata, start, end);
  }

  /** Returns a new {@link TextBasedReader} bound to this source; {@code options} is unused. */
  @Override
  protected FileBasedReader<String> createSingleFileReader(PipelineOptions options) {
    return new TextBasedReader(this);
  }

  /** Returns {@link StringUtf8Coder}, since every record is decoded to a {@link String}. */
  @Override
  public Coder<String> getDefaultOutputCoder() {
    return StringUtf8Coder.of();
  }

  /**
   * A {@link FileBasedReader FileBasedReader}
   * which can decode records delimited by newline characters.
   *
   * <p>See {@link TextSource} for further details.
   */
  @VisibleForTesting
  static class TextBasedReader extends FileBasedReader<String> {
    /** Size in bytes of the scratch buffer used for each channel read. */
    private static final int READ_BUFFER_SIZE = 8192;

    /** Scratch buffer that each channel read fills before its bytes are appended to buffer. */
    private final ByteBuffer readBuffer = ByteBuffer.allocate(READ_BUFFER_SIZE);
    /** Bytes read from the channel that have not yet been consumed as part of a record. */
    private ByteString buffer;
    /** Index in {@link #buffer} of the first byte of the most recently located separator. */
    private int startOfSeparatorInBuffer;
    /** Index in {@link #buffer} one past the last byte of the most recently located separator. */
    private int endOfSeparatorInBuffer;
    /** Offset in the file at which the current record starts. */
    private long startOfRecord;
    /** Offset in the file at which the next record starts. */
    private volatile long startOfNextRecord;
    /** Set once a channel read has returned {@code -1}. */
    private volatile boolean eof;
    /** Whether {@link #currentValue} holds a record that may be returned to the caller. */
    private volatile boolean elementIsPresent;
    /** The most recently decoded record. */
    private String currentValue;
    /** The channel being read; assigned in {@link #startReading}. */
    private ReadableByteChannel inChannel;

    private TextBasedReader(TextSource source) {
      super(source);
      buffer = ByteString.EMPTY;
    }

    /**
     * Returns the file offset at which the current record starts.
     *
     * @throws NoSuchElementException if no record is currently available
     */
    @Override
    protected long getCurrentOffset() throws NoSuchElementException {
      if (!elementIsPresent) {
        throw new NoSuchElementException();
      }
      return startOfRecord;
    }

    /**
     * Once reading has started and the next record would begin at or after the end offset of the
     * current source, returns {@code 0} if the reader is done and {@code 1} otherwise. In every
     * other case defers to the superclass.
     */
    @Override
    public long getSplitPointsRemaining() {
      if (isStarted() && startOfNextRecord >= getCurrentSource().getEndOffset()) {
        return isDone() ? 0 : 1;
      }
      return super.getSplitPointsRemaining();
    }

    /**
     * Returns the most recently decoded record.
     *
     * @throws NoSuchElementException if no record is currently available
     */
    @Override
    public String getCurrent() throws NoSuchElementException {
      if (!elementIsPresent) {
        throw new NoSuchElementException();
      }
      return currentValue;
    }

    /**
     * Remembers the channel and, when the source starts at an offset greater than zero, positions
     * the reader at the beginning of the first record that starts at or after that offset.
     *
     * @param channel the channel to read from; must be a {@link SeekableByteChannel} when the
     *     start offset is greater than zero
     * @throws IllegalStateException if a seek is required but the channel is not seekable
     * @throws IOException if seeking or reading the channel fails
     */
    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      this.inChannel = channel;
      // If the first offset is greater than zero, we need to skip bytes until we see our
      // first separator.
      if (getCurrentSource().getStartOffset() > 0) {
        checkState(channel instanceof SeekableByteChannel,
            "%s only supports reading from a SeekableByteChannel when given a start offset"
            + " greater than 0.", TextSource.class.getSimpleName());
        // Begin one byte early so that a separator ending exactly at the start offset is seen,
        // which keeps a record that begins precisely at the start offset.
        long requiredPosition = getCurrentSource().getStartOffset() - 1;
        ((SeekableByteChannel) channel).position(requiredPosition);
        findSeparatorBounds();
        // Drop the partial record and its separator; what remains starts at the next record.
        buffer = buffer.substring(endOfSeparatorInBuffer);
        startOfNextRecord = requiredPosition + endOfSeparatorInBuffer;
        endOfSeparatorInBuffer = 0;
        startOfSeparatorInBuffer = 0;
      }
    }

    /**
     * Locates the start position and end position of the next delimiter. Will
     * consume the channel till either EOF or the delimiter bounds are found.
     *
     * <p>If EOF is reached before any delimiter is found, both positions are set to the number
     * of bytes in the buffer, i.e. an empty delimiter at the very end.
     *
     * <p>This fills the buffer and updates the positions as follows:
     * <pre>{@code
     * ------------------------------------------------------
     * | element bytes | delimiter bytes | unconsumed bytes |
     * ------------------------------------------------------
     * 0            start of          end of              buffer
     *              separator         separator           size
     *              in buffer         in buffer
     * }</pre>
     */
    private void findSeparatorBounds() throws IOException {
      int bytePositionInBuffer = 0;
      while (true) {
        if (!tryToEnsureNumberOfBytesInBuffer(bytePositionInBuffer + 1)) {
          // No byte exists at this position: EOF reached without finding a delimiter.
          startOfSeparatorInBuffer = endOfSeparatorInBuffer = bytePositionInBuffer;
          break;
        }

        byte currentByte = buffer.byteAt(bytePositionInBuffer);

        if (currentByte == '\n') {
          startOfSeparatorInBuffer = bytePositionInBuffer;
          endOfSeparatorInBuffer = startOfSeparatorInBuffer + 1;
          break;
        } else if (currentByte == '\r') {
          startOfSeparatorInBuffer = bytePositionInBuffer;
          endOfSeparatorInBuffer = startOfSeparatorInBuffer + 1;

          // Peek at the following byte, if any, to treat "\r\n" as a single delimiter.
          if (tryToEnsureNumberOfBytesInBuffer(bytePositionInBuffer + 2)) {
            currentByte = buffer.byteAt(bytePositionInBuffer + 1);
            if (currentByte == '\n') {
              endOfSeparatorInBuffer += 1;
            }
          }
          break;
        }

        // Move to the next byte in buffer.
        bytePositionInBuffer += 1;
      }
    }

    /**
     * Advances to the next record, decoding it into the current value.
     *
     * @return {@code true} if a record was decoded, {@code false} if EOF was reached and no
     *     unconsumed bytes remain
     * @throws IOException if reading the channel fails
     */
    @Override
    protected boolean readNextRecord() throws IOException {
      startOfRecord = startOfNextRecord;
      findSeparatorBounds();

      // If we have reached EOF file and consumed all of the buffer then we know
      // that there are no more records.
      if (eof && buffer.size() == 0) {
        elementIsPresent = false;
        return false;
      }

      decodeCurrentElement();
      // decodeCurrentElement() trims the buffer but leaves endOfSeparatorInBuffer untouched, so
      // it still holds the number of bytes (element plus delimiter) that were just consumed.
      startOfNextRecord = startOfRecord + endOfSeparatorInBuffer;
      return true;
    }

    /**
     * Decodes the current element updating the buffer to only contain the unconsumed bytes.
     *
     * <p>This invalidates the currently stored {@code startOfSeparatorInBuffer} and
     * {@code endOfSeparatorInBuffer}.
     */
    private void decodeCurrentElement() {
      ByteString dataToDecode = buffer.substring(0, startOfSeparatorInBuffer);
      currentValue = dataToDecode.toStringUtf8();
      elementIsPresent = true;
      buffer = buffer.substring(endOfSeparatorInBuffer);
    }

    /**
     * Reads from the channel until the buffer holds at least {@code minCapacity} bytes or EOF is
     * reached.
     *
     * @param minCapacity the number of bytes the buffer should hold
     * @return {@code true} if the buffer holds at least {@code minCapacity} bytes; {@code false}
     *     if we were unable to ensure the minimum capacity by consuming the channel
     * @throws IOException if reading the channel fails
     */
    private boolean tryToEnsureNumberOfBytesInBuffer(int minCapacity) throws IOException {
      // While we aren't at EOF and haven't fulfilled the minimum buffer capacity,
      // attempt to read more bytes. The comparison is strict: once exactly minCapacity bytes
      // are buffered the request is satisfied, and reading further could block on the channel
      // and delay a record that is already complete.
      while (buffer.size() < minCapacity && !eof) {
        eof = inChannel.read(readBuffer) == -1;
        readBuffer.flip();
        buffer = buffer.concat(ByteString.copyFrom(readBuffer));
        readBuffer.clear();
      }
      // Return true if we were able to honor the minimum buffer capacity request
      return buffer.size() >= minCapacity;
    }
  }
}