/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.NoSuchElementException;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.codehaus.stax2.XMLInputFactory2;

/** Implementation of {@link XmlIO#read}. */
public class XmlSource<T> extends FileBasedSource<T> {

  /** XML version written into the synthetic document header handed to the parser. */
  private static final String XML_VERSION = "1.1";

  /** The {@link XmlIO.Read} configuration this source was created from. */
  private final XmlIO.Read<T> spec;

  /** Creates a source for the file or pattern named by {@code spec}. */
  XmlSource(XmlIO.Read<T> spec) {
    super(StaticValueProvider.of(spec.getFileOrPatternSpec()), spec.getMinBundleSize());
    this.spec = spec;
  }

  /** Creates a source restricted to {@code [startOffset, endOffset)} of a single file. */
  private XmlSource(XmlIO.Read<T> spec, Metadata metadata, long startOffset, long endOffset) {
    super(metadata, spec.getMinBundleSize(), startOffset, endOffset);
    this.spec = spec;
  }

  @Override
  protected FileBasedSource<T> createForSubrangeOfFile(Metadata metadata, long start, long end) {
    return new XmlSource<T>(spec.from(metadata.toString()), metadata, start, end);
  }

  @Override
  protected FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
    return new XMLReader<T>(this);
  }

  @Override
  public void validate() {
    super.validate();
    spec.validate(null);
  }

  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    spec.populateDisplayData(builder);
  }

  @Override
  public Coder<T> getDefaultOutputCoder() {
    return JAXBCoder.of(spec.getRecordClass());
  }

  /**
   * A {@link FileBasedSource.FileBasedReader} for reading JAXB annotated Java objects from an XML
   * file. The XML file should be of the form defined at {@link XmlSource}.
   *
   * <p>Timestamped values are currently unsupported - all values implicitly have the timestamp of
   * {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
   *
   * @param <T> Type of objects that will be read by the reader.
   */
  private static class XMLReader<T> extends FileBasedReader<T> {
    // The amount of bytes read from the channel to memory when determining the starting offset of
    // the first record in a bundle. After matching to starting offset of the first record the
    // remaining bytes read to this buffer and the bytes still not read from the channel are used
    // to create the XML parser.
    private static final int BUF_SIZE = 1024;

    // This should be the maximum number of bytes a character will encode to, for any encoding
    // supported by XmlSource. Currently this is set to 4 since UTF-8 characters may be four bytes.
    private static final int MAX_CHAR_BYTES = 4;

    // In order to support reading starting in the middle of an XML file, we construct an imaginary
    // well-formed document (a header and root tag followed by the contents of the input starting
    // at the record boundary) and feed it to the parser. Because of this, the offset reported by
    // the XML parser is not the same as offset in the original file. They differ by a constant
    // amount:
    //   offsetInOriginalFile = parser.getLocation().getCharacterOffset() + parserBaseOffset;
    // Note that this is true only for files with single-byte characters.
    // It appears that, as of writing, there does not exist a Java XML parser capable of correctly
    // reporting byte offsets of elements in the presence of multi-byte characters.
    private long parserBaseOffset = 0;

    // Set once startReading() has located a record and created the parser.
    private boolean readingStarted = false;

    // If true, the current bundle does not contain any records.
    private boolean emptyBundle = false;

    private Unmarshaller jaxbUnmarshaller = null;
    private XMLStreamReader parser = null;
    private T currentRecord = null;

    // Byte offset of the current record in the XML file provided when creating the source.
    private long currentByteOffset = 0;

    /**
     * Creates a reader for {@code source} and prepares the JAXB unmarshaller for its record class.
     *
     * @throws RuntimeException wrapping the {@link JAXBException} if the JAXB context or
     *     unmarshaller cannot be created
     */
    public XMLReader(XmlSource<T> source) {
      super(source);

      // Set up a JAXB Unmarshaller that can be used to unmarshall record objects.
      try {
        XmlIO.Read<T> readSpec = getCurrentSource().spec;
        JAXBContext jaxbContext = JAXBContext.newInstance(readSpec.getRecordClass());
        jaxbUnmarshaller = jaxbContext.createUnmarshaller();
        if (readSpec.getValidationEventHandler() != null) {
          jaxbUnmarshaller.setEventHandler(readSpec.getValidationEventHandler());
        }
      } catch (JAXBException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public synchronized XmlSource<T> getCurrentSource() {
      return (XmlSource<T>) super.getCurrentSource();
    }

    /**
     * Locates the first record in {@code channel} and positions the XML parser on it.
     *
     * <p>This method determines the correct starting offset of the first record by reading bytes
     * from the channel; it does not need the channel to be seekable. The first record must start
     * with the characters "&lt;recordElement" followed by a whitespace character, '&gt;' or '/'
     * (the latter to support empty records).
     *
     * <p>After the match, the parser is fed a fake document consisting of an XML header and the
     * root element's opening tag, followed by the contents of the channel starting at the record
     * element. The root tag may never be closed. If no record is found the bundle is marked empty
     * and no parser is created.
     *
     * @param channel channel positioned at the start offset of this source
     * @throws IOException if reading the channel or setting up the parser fails
     */
    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      XmlIO.Read<T> readSpec = getCurrentSource().spec;

      // This stores any bytes that should be used prior to the remaining bytes of the channel when
      // creating an XML parser object.
      ByteArrayOutputStream preambleByteBuffer = new ByteArrayOutputStream();

      // A dummy declaration and root for the document with proper XML version and encoding.
      // Without this XML parsing may fail or may produce incorrect results.
      String dummyStartDocument =
          String.format(
              "<?xml version=\"%s\" encoding=\"%s\"?><%s>",
              XML_VERSION, readSpec.getCharset(), readSpec.getRootElement());
      byte[] dummyStartDocumentBytes = dummyStartDocument.getBytes(readSpec.getCharset());
      preambleByteBuffer.write(dummyStartDocumentBytes);

      // Gets the byte offset (in the input file) of the first record in the channel. Any bytes
      // already consumed from the channel that the parser still needs are appended to
      // preambleByteBuffer.
      long offsetInFileOfRecordElement =
          getFirstOccurenceOfRecordElement(channel, preambleByteBuffer);
      if (offsetInFileOfRecordElement < 0) {
        // Bundle has no records. So marking this bundle as an empty bundle.
        emptyBundle = true;
        return;
      }

      currentByteOffset = offsetInFileOfRecordElement;
      setUpXMLParser(channel, preambleByteBuffer.toByteArray());
      parserBaseOffset = offsetInFileOfRecordElement - dummyStartDocumentBytes.length;
      readingStarted = true;
    }

    /**
     * Finds the first occurrence of the record element within the given channel.
     *
     * <p>Bytes that were read from the channel at or after the start of the record (the matched
     * "&lt;recordElement", the look-ahead bytes and the unread rest of the current buffer) are
     * appended to {@code preambleByteBuffer} so that the parser can consume them before the
     * remaining channel content.
     *
     * <p>The pattern and the look-ahead character are encoded/decoded with the charset configured
     * on the source, matching the charset used for the synthetic header and the parser.
     *
     * @param channel channel to scan, positioned at the start offset of this source
     * @param preambleByteBuffer receives the bytes already consumed that belong to the record
     * @return the offset in the file of the record's '&lt;', or -1 if no record was found
     * @throws IOException if reading the channel fails or the configured charset is unsupported
     */
    private long getFirstOccurenceOfRecordElement(
        ReadableByteChannel channel, ByteArrayOutputStream preambleByteBuffer) throws IOException {
      String charsetName = getCurrentSource().spec.getCharset();
      byte[] recordStartBytes =
          ("<" + getCurrentSource().spec.getRecordElement()).getBytes(charsetName);

      // Index of the byte in "<recordElement" to be matched against the current byte.
      int byteIndexInRecordElementToMatch = 0;
      // "<recordElement" matched. Still have to look at the next character to confirm the match.
      boolean recordStartBytesMatched = false;
      // If true, a match is currently in progress.
      boolean matchStarted = false;
      // Starting offset of the match in progress, or -1 if there is none.
      long startingOffsetInFileOfCurrentMatch = -1;

      // Offset of the byte currently being read. Starts one before the source's start offset
      // because it is incremented before each byte is examined.
      long offsetInFileOfCurrentByte = getCurrentSource().getStartOffset() - 1;

      // Bytes following "<recordElement" that are decoded to obtain the character immediately
      // after the match. See the comment for MAX_CHAR_BYTES.
      byte[] charBytes = new byte[MAX_CHAR_BYTES];
      int charBytesFound = 0;

      ByteBuffer buf = ByteBuffer.allocate(BUF_SIZE);
      while (channel.read(buf) > 0) {
        buf.flip();
        while (buf.hasRemaining()) {
          offsetInFileOfCurrentByte++;
          byte b = buf.get();
          boolean reset = false;

          if (recordStartBytesMatched) {
            // Collect look-ahead bytes until there are enough to decode one character.
            charBytes[charBytesFound] = b;
            charBytesFound++;
            if (charBytesFound < charBytes.length) {
              continue;
            }

            // Record start may be of following forms
            // * "<recordElement<whitespace>..."
            // * "<recordElement>..."
            // * "<recordElement/..."
            int c = decodeFirstChar(charBytes, charsetName);
            if (c >= 0 && (Character.isWhitespace((char) c) || c == '>' || c == '/')) {
              // Hand everything already consumed from the channel back to the parser: the matched
              // prefix, the look-ahead bytes and the unread rest of the current buffer.
              preambleByteBuffer.write(recordStartBytes);
              preambleByteBuffer.write(charBytes);
              while (buf.hasRemaining()) {
                preambleByteBuffer.write(buf.get());
              }
              return startingOffsetInFileOfCurrentMatch;
            }

            // Matching was unsuccessful ("recordElement" was only a prefix of a longer name).
            // Push the look-ahead bytes back in front of the unread bytes so they are scanned
            // again. The look-ahead bytes may partly stem from the previous channel read, so the
            // combined size can exceed BUF_SIZE; size the buffer accordingly.
            ByteBuffer rewound =
                ByteBuffer.allocate(Math.max(BUF_SIZE, charBytes.length + buf.remaining()));
            rewound.put(charBytes);
            rewound.put(buf);
            rewound.flip();
            buf = rewound;
            offsetInFileOfCurrentByte -= charBytes.length;
            reset = true;
          } else if (b == recordStartBytes[byteIndexInRecordElementToMatch]) {
            // Next byte matched.
            if (!matchStarted) {
              // Match was for the first byte, record the starting offset.
              matchStarted = true;
              startingOffsetInFileOfCurrentMatch = offsetInFileOfCurrentByte;
            }
            byteIndexInRecordElementToMatch++;
          } else {
            // Not a match. Ignore everything and try again starting at current point.
            reset = true;
          }

          if (reset) {
            // Clear variables and try to match starting from the next byte.
            byteIndexInRecordElementToMatch = 0;
            startingOffsetInFileOfCurrentMatch = -1;
            matchStarted = false;
            recordStartBytesMatched = false;
            charBytes = new byte[MAX_CHAR_BYTES];
            charBytesFound = 0;
          }

          if (byteIndexInRecordElementToMatch == recordStartBytes.length) {
            // "<recordElement" matched. Need to still check next character since this might be an
            // element that has "recordElement" as a prefix.
            recordStartBytesMatched = true;
          }
        }
        buf.clear();
      }

      // Channel exhausted without a confirmed match.
      return -1;
    }

    /**
     * Decodes the first character encoded in {@code bytes}.
     *
     * @param bytes encoded bytes, starting at a character boundary
     * @param charsetName name of the charset the bytes are encoded in
     * @return the first decoded char as a non-negative int, or -1 if nothing could be decoded
     * @throws IOException if the charset is unsupported or decoding fails
     */
    private static int decodeFirstChar(byte[] bytes, String charsetName) throws IOException {
      CharBuffer decoded = CharBuffer.allocate(1);
      InputStream in = new ByteArrayInputStream(bytes);
      try (InputStreamReader reader = new InputStreamReader(in, charsetName)) {
        if (reader.read(decoded) <= 0) {
          return -1;
        }
      }
      decoded.flip();
      return decoded.get();
    }

    /**
     * Creates the XML parser over {@code lookAhead} followed by the rest of {@code channel} and
     * advances it to the first start tag whose local name equals the record element.
     *
     * @param channel channel holding the not-yet-read remainder of the file
     * @param lookAhead synthetic header plus bytes already consumed from the channel
     * @throws IOException wrapping any parser creation or parsing failure
     */
    private void setUpXMLParser(ReadableByteChannel channel, byte[] lookAhead)
        throws IOException {
      try {
        // We use Woodstox because the StAX implementation provided by OpenJDK reports
        // character locations incorrectly. Note that Woodstox still currently reports *byte*
        // locations incorrectly when parsing documents that contain multi-byte characters.
        XMLInputFactory2 xmlInputFactory = (XMLInputFactory2) XMLInputFactory.newInstance();
        InputStream documentStream =
            new SequenceInputStream(
                new ByteArrayInputStream(lookAhead), Channels.newInputStream(channel));
        this.parser =
            xmlInputFactory.createXMLStreamReader(
                documentStream, getCurrentSource().spec.getCharset());

        // Skip the synthetic root; stop on the first record element's start tag.
        String recordElement = getCurrentSource().spec.getRecordElement();
        boolean atRecord = false;
        while (!atRecord) {
          int event = parser.next();
          atRecord =
              event == XMLStreamConstants.START_ELEMENT
                  && parser.getLocalName().equals(recordElement);
        }
      } catch (FactoryConfigurationError | XMLStreamException e) {
        throw new IOException(e);
      }
    }

    /**
     * Advances to the next start element and unmarshals it as the current record.
     *
     * @return {@code true} if a record was read; {@code false} if the bundle is empty or the end
     *     of the document was reached, in which case the current offset is set to {@code
     *     Long.MAX_VALUE}
     * @throws IOException wrapping any JAXB or XML stream failure
     */
    @Override
    protected boolean readNextRecord() throws IOException {
      if (emptyBundle) {
        currentByteOffset = Long.MAX_VALUE;
        return false;
      }
      try {
        // Update current offset and check if the next value is the record element.
        currentByteOffset = parserBaseOffset + parser.getLocation().getCharacterOffset();
        while (parser.getEventType() != XMLStreamConstants.START_ELEMENT) {
          parser.next();
          currentByteOffset = parserBaseOffset + parser.getLocation().getCharacterOffset();
          if (parser.getEventType() == XMLStreamConstants.END_DOCUMENT) {
            currentByteOffset = Long.MAX_VALUE;
            return false;
          }
        }
        JAXBElement<T> jb =
            jaxbUnmarshaller.unmarshal(parser, getCurrentSource().spec.getRecordClass());
        currentRecord = jb.getValue();
        return true;
      } catch (JAXBException | XMLStreamException e) {
        throw new IOException(e);
      }
    }

    /**
     * Returns the most recently read record.
     *
     * @throws NoSuchElementException if reading has not started (no record was located)
     */
    @Override
    public T getCurrent() throws NoSuchElementException {
      if (!readingStarted) {
        throw new NoSuchElementException();
      }
      return currentRecord;
    }

    @Override
    protected boolean isAtSplitPoint() {
      // Every record is at a split point.
      return true;
    }

    @Override
    protected long getCurrentOffset() {
      return currentByteOffset;
    }
  }
}