XmlSource.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.NoSuchElementException;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.codehaus.stax2.XMLInputFactory2;

/** Implementation of {@link XmlIO#read}. */
public class XmlSource<T> extends FileBasedSource<T> {

  private static final String XML_VERSION = "1.1";

  private final XmlIO.Read<T> spec;

  /**
   * Creates a source from a read specification.
   *
   * <p>The file or pattern spec (wrapped in a {@link StaticValueProvider}) and the minimum bundle
   * size are taken from {@code spec} and handed to the {@link FileBasedSource} constructor; the
   * spec itself is retained for later use by this source and its readers.
   *
   * @param spec the {@link XmlIO.Read} configuration describing what to read
   */
  XmlSource(XmlIO.Read<T> spec) {
    super(StaticValueProvider.of(spec.getFileOrPatternSpec()), spec.getMinBundleSize());
    this.spec = spec;
  }

  /**
   * Creates a source covering the range {@code [startOffset, endOffset)} handed to the {@link
   * FileBasedSource} constructor together with the file metadata and the spec's minimum bundle
   * size.
   *
   * @param spec the {@link XmlIO.Read} configuration retained by this source
   * @param metadata metadata of the single file this source reads from
   * @param startOffset start of the range passed to the superclass
   * @param endOffset end of the range passed to the superclass
   */
  private XmlSource(XmlIO.Read<T> spec, Metadata metadata, long startOffset, long endOffset) {
    super(metadata, spec.getMinBundleSize(), startOffset, endOffset);
    this.spec = spec;
  }

  /**
   * Builds a new {@link XmlSource} restricted to the given range of a single file.
   *
   * <p>NOTE(review): the spec is re-pointed at {@code metadata.toString()}; confirm that this
   * string is the intended file name and not merely a debug representation of the metadata.
   *
   * @param metadata metadata of the file to read
   * @param start start of the sub-range
   * @param end end of the sub-range
   * @return a source for the requested sub-range carrying a spec derived from this one
   */
  @Override
  protected FileBasedSource<T> createForSubrangeOfFile(Metadata metadata, long start, long end) {
    XmlIO.Read<T> subrangeSpec = spec.from(metadata.toString());
    return new XmlSource<>(subrangeSpec, metadata, start, end);
  }

  /**
   * Creates the reader used to read records of this source from a single file.
   *
   * @param options pipeline options; not consulted by this implementation
   * @return a new {@code XMLReader} bound to this source
   */
  @Override
  protected FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
    return new XMLReader<>(this);
  }

  /**
   * Validates this source by running the superclass validation first and then the validation of
   * the read spec, which is invoked with a {@code null} argument.
   */
  @Override
  public void validate() {
    super.validate();
    spec.validate(null);
  }

  /**
   * Registers display data by delegating to the read spec. The superclass implementation is not
   * invoked here.
   *
   * @param builder the builder that receives the display data
   */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    spec.populateDisplayData(builder);
  }

  /**
   * Returns the coder for records produced by this source.
   *
   * @return a {@link JAXBCoder} created for the record class configured in the read spec
   */
  @Override
  public Coder<T> getDefaultOutputCoder() {
    Class<T> recordClass = spec.getRecordClass();
    return JAXBCoder.of(recordClass);
  }

  /**
   * A {@link Source.Reader} for reading JAXB annotated Java objects from an XML file. The XML
   * file should be of the form defined at {@link XmlSource}.
   *
   * <p>Timestamped values are currently unsupported - all values implicitly have the timestamp
   * of {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
   *
   * @param <T> Type of objects that will be read by the reader.
   */
  private static class XMLReader<T> extends FileBasedReader<T> {
    // The amount of bytes read from the channel to memory when determining the starting offset of
    // the first record in a bundle. After matching to starting offset of the first record the
    // remaining bytes read to this buffer and the bytes still not read from the channel are used to
    // create the XML parser.
    private static final int BUF_SIZE = 1024;

    // This should be the maximum number of bytes a character will encode to, for any encoding
    // supported by XmlSource. Currently this is set to 4 since UTF-8 characters may be
    // four bytes.
    private static final int MAX_CHAR_BYTES = 4;

    // In order to support reading starting in the middle of an XML file, we construct an imaginary
    // well-formed document (a header and root tag followed by the contents of the input starting at
    // the record boundary) and feed it to the parser. Because of this, the offset reported by the
    // XML parser is not the same as offset in the original file. They differ by a constant amount:
    // offsetInOriginalFile = parser.getLocation().getCharacterOffset() + parserBaseOffset;
    // Note that this is true only for files with single-byte characters.
    // It appears that, as of writing, there does not exist a Java XML parser capable of correctly
    // reporting byte offsets of elements in the presence of multi-byte characters.
    private long parserBaseOffset = 0;
    private boolean readingStarted = false;

    // If true, the current bundle does not contain any records.
    private boolean emptyBundle = false;

    private Unmarshaller jaxbUnmarshaller = null;
    private XMLStreamReader parser = null;

    private T currentRecord = null;

    // Byte offset of the current record in the XML file provided when creating the source.
    private long currentByteOffset = 0;

    public XMLReader(XmlSource<T> source) {
      super(source);

      // Set up a JAXB Unmarshaller that can be used to unmarshall record objects.
      try {
        JAXBContext jaxbContext = JAXBContext.newInstance(getCurrentSource().spec.getRecordClass());
        jaxbUnmarshaller = jaxbContext.createUnmarshaller();
        if (getCurrentSource().spec.getValidationEventHandler() != null) {
          jaxbUnmarshaller.setEventHandler(getCurrentSource().spec.getValidationEventHandler());
        }
      } catch (JAXBException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Returns the current source of this reader, narrowed to {@link XmlSource} so that callers in
     * this class can reach the read spec.
     *
     * @return the value of {@code super.getCurrentSource()} cast to {@code XmlSource<T>}
     */
    @Override
    public synchronized XmlSource<T> getCurrentSource() {
      return (XmlSource<T>) super.getCurrentSource();
    }

    /**
     * Locates the first record in {@code channel} and sets up the XML parser on it.
     *
     * <p>The channel is scanned byte by byte for the first record start, i.e. the characters
     * {@code "<recordElement"} followed by a whitespace character, {@code '>'} or {@code '/'} (the
     * latter to support empty records). The channel does not need to be seekable.
     *
     * <p>The parser is then fed a synthetic document: an XML declaration and an opening root tag,
     * followed by the contents of the channel starting at the record start. The root tag may
     * never be closed. Without the synthetic declaration and root, parsing may fail or produce
     * incorrect results.
     *
     * <p>If no record start is found the bundle is flagged as empty and no parser is created.
     *
     * @param channel the channel to read the records of this bundle from
     * @throws IOException if reading the channel or setting up the parser fails
     */
    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      XmlIO.Read<T> readSpec = getCurrentSource().spec;

      // Synthetic declaration and root element carrying the configured XML version and encoding.
      String syntheticProlog =
          String.format(
              "<?xml version=\"%s\" encoding=\"" + readSpec.getCharset() + "\"?><%s>",
              XML_VERSION, readSpec.getRootElement());
      byte[] syntheticPrologBytes = syntheticProlog.getBytes(readSpec.getCharset());

      // Collects every byte that has to be handed to the parser ahead of the bytes still unread
      // in the channel: the synthetic prolog first, then whatever the record search consumed.
      ByteArrayOutputStream parserPrefix = new ByteArrayOutputStream();
      parserPrefix.write(syntheticPrologBytes);

      // Offset (in the input file) of the first record; the search appends the bytes it read
      // from the record start onwards to parserPrefix.
      long firstRecordOffset = getFirstOccurenceOfRecordElement(channel, parserPrefix);
      if (firstRecordOffset < 0) {
        // No record in this bundle.
        emptyBundle = true;
        return;
      }

      currentByteOffset = firstRecordOffset;
      setUpXMLParser(channel, parserPrefix.toByteArray());
      // Parser offsets include the synthetic prolog, file offsets do not.
      parserBaseOffset = firstRecordOffset - syntheticPrologBytes.length;
      readingStarted = true;
    }

    /**
     * Finds the first occurrence of a record start within {@code channel}.
     *
     * <p>A record start is the byte sequence {@code "<recordElement"} (encoded as UTF-8)
     * immediately followed by a whitespace character, {@code '>'} or {@code '/'}. The following
     * character is decoded from the next {@code MAX_CHAR_BYTES} bytes, so a match is only
     * confirmed once that many bytes have been read after the tag name.
     *
     * <p>On a match, the tag bytes, the look-ahead bytes and the unread rest of the current read
     * buffer are appended to {@code preambleByteBuffer}: they have already been consumed from the
     * channel and must be replayed to the XML parser.
     *
     * <p>NOTE(review): the tag bytes and the look-ahead character are always handled as UTF-8,
     * independently of the charset configured in the spec; confirm this is intended for charsets
     * that are not ASCII-compatible.
     *
     * @param channel channel to scan; its first byte is taken to be at the start offset of the
     *     current source
     * @param preambleByteBuffer receives the bytes read from the record start onwards
     * @return the offset in the file of the {@code '<'} starting the first record, or -1 if no
     *     record start was found (or the look-ahead character could not be decoded)
     * @throws IOException if reading from the channel fails
     */
    private long getFirstOccurenceOfRecordElement(
        ReadableByteChannel channel, ByteArrayOutputStream preambleByteBuffer) throws IOException {
      // Index of the byte in the string "<recordElement" to be matched against the current byte
      // from the stream.
      int byteIndexInRecordElementToMatch = 0;
      // True once "<recordElement" has matched. The next character still has to be checked to
      // confirm that this is a positive match.
      boolean recordStartBytesMatched = false;

      // Offset of the byte currently being read. Starts at '-1' relative to the start offset
      // since it is incremented before each byte is examined.
      long offsetInFileOfCurrentByte = getCurrentSource().getStartOffset() - 1;
      // Starting offset of the match currently in progress, or -1 if there is none.
      long startingOffsetInFileOfCurrentMatch = -1;
      boolean matchStarted = false;

      // Bytes that follow "<recordElement", collected until a full character can be decoded.
      byte[] charBytes = new byte[MAX_CHAR_BYTES];
      int charBytesFound = 0;

      ByteBuffer buf = ByteBuffer.allocate(BUF_SIZE);
      byte[] recordStartBytes =
          ("<" + getCurrentSource().spec.getRecordElement()).getBytes(StandardCharsets.UTF_8);

      while (channel.read(buf) > 0) {
        buf.flip();
        while (buf.hasRemaining()) {
          offsetInFileOfCurrentByte++;
          byte b = buf.get();
          boolean reset = false;
          if (recordStartBytesMatched) {
            // "<recordElement" already matched; gather the bytes of the next character to decide
            // whether this really is the start of a record.
            charBytes[charBytesFound] = b;
            charBytesFound++;
            if (charBytesFound < charBytes.length) {
              continue;
            }
            int read =
                new InputStreamReader(new ByteArrayInputStream(charBytes), StandardCharsets.UTF_8)
                    .read();
            if (read <= 0) {
              return -1;
            }
            char c = (char) read;

            // Record start may be of following forms
            // * "<recordElement<whitespace>..."
            // * "<recordElement>..."
            // * "<recordElement/..."
            if (Character.isWhitespace(c) || c == '>' || c == '/') {
              // Replay everything already consumed from the channel: the tag, the look-ahead
              // bytes and the rest of the current buffer.
              preambleByteBuffer.write(recordStartBytes);
              preambleByteBuffer.write(charBytes);
              while (buf.hasRemaining()) {
                preambleByteBuffer.write(buf.get());
              }
              return startingOffsetInFileOfCurrentMatch;
            }

            // Matching was unsuccessful. Rebuild the buffer so that the look-ahead bytes are
            // scanned again. The look-ahead bytes may have been collected partly from the
            // previous read, so together with the rest of a full buffer they can exceed
            // BUF_SIZE; size the new buffer for what it actually has to hold.
            ByteBuffer newbuf =
                ByteBuffer.allocate(Math.max(BUF_SIZE, charBytes.length + buf.remaining()));
            newbuf.put(charBytes);
            newbuf.put(buf);
            newbuf.flip();
            buf = newbuf;
            offsetInFileOfCurrentByte -= charBytes.length;

            // Ignore everything and try again starting from the rebuilt buffer.
            reset = true;
          } else if (b == recordStartBytes[byteIndexInRecordElementToMatch]) {
            // Next byte matched.
            if (!matchStarted) {
              // Match was for the first byte, record the starting offset.
              matchStarted = true;
              startingOffsetInFileOfCurrentMatch = offsetInFileOfCurrentByte;
            }
            byteIndexInRecordElementToMatch++;
          } else {
            // Not a match. Ignore everything and try again starting at current point.
            reset = true;
          }
          if (reset) {
            // Clear variables and try to match starting from the next byte.
            byteIndexInRecordElementToMatch = 0;
            startingOffsetInFileOfCurrentMatch = -1;
            matchStarted = false;
            recordStartBytesMatched = false;
            charBytes = new byte[MAX_CHAR_BYTES];
            charBytesFound = 0;
          }
          if (byteIndexInRecordElementToMatch == recordStartBytes.length) {
            // "<recordElement" matched. Need to still check next character since this might be
            // an element that has "recordElement" as a prefix.
            recordStartBytesMatched = true;
          }
        }
        buf.clear();
      }

      // Channel exhausted without a confirmed record start.
      return -1;
    }

    /**
     * Creates the XML stream parser and advances it to the first record element.
     *
     * <p>The parser reads {@code lookAhead} first and then the remaining bytes of {@code
     * channel}, decoded with the charset configured in the spec. On return the parser is
     * positioned on the start element of the first record. If the document ends before any
     * record start element is seen (for instance because the byte-level search matched text
     * that the parser does not treat as an element), the bundle is marked as empty instead of
     * reading past the end of the document.
     *
     * @param channel channel providing the bytes that follow {@code lookAhead}
     * @param lookAhead bytes to feed to the parser before the contents of {@code channel}
     * @throws IOException if the parser cannot be created or the input cannot be parsed
     */
    private void setUpXMLParser(ReadableByteChannel channel, byte[] lookAhead) throws IOException {
      try {
        // We use Woodstox because the StAX implementation provided by OpenJDK reports
        // character locations incorrectly. Note that Woodstox still currently reports *byte*
        // locations incorrectly when parsing documents that contain multi-byte characters.
        XMLInputFactory2 xmlInputFactory = (XMLInputFactory2) XMLInputFactory.newInstance();
        this.parser =
            xmlInputFactory.createXMLStreamReader(
                new SequenceInputStream(
                    new ByteArrayInputStream(lookAhead), Channels.newInputStream(channel)),
                getCurrentSource().spec.getCharset());

        // Skip ahead until the parser sits on the start element of a record.
        while (parser.hasNext()) {
          if (parser.next() == XMLStreamConstants.START_ELEMENT
              && parser.getLocalName().equals(getCurrentSource().spec.getRecordElement())) {
            return;
          }
        }

        // End of document reached without encountering a record element.
        emptyBundle = true;
      } catch (FactoryConfigurationError | XMLStreamException e) {
        throw new IOException(e);
      }
    }

    /**
     * Reads the next record, if any, and makes it the current record.
     *
     * <p>The parser is advanced until it is positioned on a start element, which is then
     * unmarshalled into the configured record class. The current offset is updated from the
     * parser's character offset plus {@code parserBaseOffset} along the way, and set to {@code
     * Long.MAX_VALUE} once the bundle is known to have no further records.
     *
     * @return {@code true} if a record was read, {@code false} if the bundle is empty or the end
     *     of the document was reached
     * @throws IOException if parsing or unmarshalling fails
     */
    @Override
    protected boolean readNextRecord() throws IOException {
      if (emptyBundle) {
        currentByteOffset = Long.MAX_VALUE;
        return false;
      }
      try {
        // Refresh the offset, then move forward until the parser sits on a start element.
        currentByteOffset = parserBaseOffset + parser.getLocation().getCharacterOffset();
        int event = parser.getEventType();
        while (event != XMLStreamConstants.START_ELEMENT) {
          event = parser.next();
          currentByteOffset = parserBaseOffset + parser.getLocation().getCharacterOffset();
          if (event == XMLStreamConstants.END_DOCUMENT) {
            currentByteOffset = Long.MAX_VALUE;
            return false;
          }
        }
        currentRecord =
            jaxbUnmarshaller.unmarshal(parser, getCurrentSource().spec.getRecordClass()).getValue();
        return true;
      } catch (JAXBException | XMLStreamException e) {
        throw new IOException(e);
      }
    }

    /**
     * Returns the record most recently read by this reader.
     *
     * @return the current record
     * @throws NoSuchElementException if reading has not been started
     */
    @Override
    public T getCurrent() throws NoSuchElementException {
      if (readingStarted) {
        return currentRecord;
      }
      throw new NoSuchElementException();
    }

    /**
     * Reports whether the current record is at a split point.
     *
     * @return always {@code true}
     */
    @Override
    protected boolean isAtSplitPoint() {
      // Every record is at a split point.
      return true;
    }

    /**
     * Returns the offset tracked for the current record.
     *
     * @return the value of {@code currentByteOffset}, which {@code readNextRecord} sets to {@code
     *     Long.MAX_VALUE} once there are no more records
     */
    @Override
    protected long getCurrentOffset() {
      return currentByteOffset;
    }
  }
}