/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.input.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.annotations.ApiAudience;

/**
 * An {@link org.apache.hadoop.mapreduce.InputFormat} for formatted XML files.  XML files are read
 * character by character searching for record start and end tags.  Keys are the offset in the
 * file of the start of record opening tags.  Values are the configured XML header followed by the
 * contents of the file between start and end tags (inclusive), i.e.
 * {@code <record>contents</record>}.  Comments and CDATA containing record tags
 * will be read as valid tags and may cause the record reader to return invalid records.
 *
 * <p>
 *   Offsets are advanced once per decoded character, so they coincide with byte offsets only
 *   for input made of single-byte characters.
 * </p>
 *
 * This Input format can be used with the stock
 * {@link org.kiji.mapreduce.lib.bulkimport.XMLBulkImporter}.
 */
@ApiAudience.Private
public final class XMLInputFormat extends FileInputFormat<LongWritable, Text> {
  private static final Logger LOG = LoggerFactory.getLogger(XMLInputFormat.class);

  /** Configuration key for XML tag to start and end records. */
  public static final String RECORD_TAG_CONF_KEY = "kiji.input.xml.record.tag";

  /** Configuration key for XML version and encoding information. */
  public static final String XML_HEADER_CONF_KEY = "kiji.input.xml.header";

  /**
   * Configuration key for setting the maximum number of bytes the record reader may read beyond
   * the end of a split.  Default value is equal to the size of the split.
   */
  public static final String XML_OVERRUN_CONF_KEY = "kiji.input.xml.overrun.allowance";

  /** {@inheritDoc} */
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    final XMLRecordReader reader = new XMLRecordReader();
    // The reader is returned already initialized, as callers of this method may rely on that.
    // NOTE(review): the MapReduce framework normally calls initialize() itself on the returned
    // reader; XMLRecordReader.initialize() therefore tolerates being invoked more than once.
    reader.initialize(split, context);
    return reader;
  }

  /**
   * <p>
   *   A {@link org.apache.hadoop.mapreduce.RecordReader} for parsing XML records.  Seeks until it
   *   finds the user specified start tag, captures until it finds the user specified end tag, and
   *   returns a <code>Text</code> object containing an entire XML record.
   * </p>
   * <p>
   *   XMLRecordReader is package private for testing purposes only and should not be accessed
   *   externally.
   * </p>
   */
  @ApiAudience.Private
  static final class XMLRecordReader extends RecordReader<LongWritable, Text> {
    private static final String DEFAULT_XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";

    /** The byte offset within the file of the start of the current file split. */
    private long mStartOffset;

    /** The byte offset within the file of the end of the current file split (exclusive). */
    private long mEndOffset;

    /** The current offset within the file (absolute, i.e. it starts at the split start). */
    private long mCurrentOffset;

    /**
     * The maximum bytes the reader may read beyond the end of the split when searching for the
     * end of a record.
     */
    private long mOverrunAllowance;

    /** A reader for the input file; null until initialized and after being closed. */
    private BufferedReader mReader;

    /** Offset of the current record. */
    private LongWritable mCurrentKey;

    /** Value of the current record. */
    private Text mCurrentValue;

    /** Tag that marks the beginning of a record. */
    private char[] mRecordBeginChars;

    /** Tag that marks the end of a record. */
    private char[] mRecordEndChars;

    /** XML header for each record. */
    private String mHeader;

    /** StringBuilder holding partial record. */
    private StringBuilder mRecordBuilder;

    /**
     * {@inheritDoc}
     *
     * <p>
     *   May be called more than once: any reader opened by a previous call is closed before
     *   the split is reopened.
     * </p>
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      final FileSplit fileSplit = downcast(split);
      final Configuration conf = context.getConfiguration();

      // Validate the configuration before touching the file system, so that a missing record
      // tag cannot leave an open stream behind.
      final String recordTag = Preconditions.checkNotNull(
          conf.get(RECORD_TAG_CONF_KEY),
          "Record tag may not be null. Specify a record tag in "
          + "the configuration with key: " + RECORD_TAG_CONF_KEY);

      // The begin tag omits the trailing '>' because an opening tag may carry attributes.
      mRecordBeginChars = String.format("<%s", recordTag).toCharArray();
      mRecordEndChars = String.format("</%s>", recordTag).toCharArray();
      mHeader = conf.get(XML_HEADER_CONF_KEY, DEFAULT_XML_HEADER);

      // Release the stream opened by an earlier initialize() call, if any.
      close();

      // Initialize the start and end offsets for the current split.
      mStartOffset = fileSplit.getStart();
      mEndOffset = mStartOffset + fileSplit.getLength();
      mOverrunAllowance = conf.getLong(XML_OVERRUN_CONF_KEY, fileSplit.getLength());

      // Open the file containing the input split.
      final FileSystem fileSystem = fileSplit.getPath().getFileSystem(conf);
      final FSDataInputStream fileInputStream = fileSystem.open(fileSplit.getPath());
      boolean opened = false;
      try {
        // Seek to the beginning of the input split.
        fileInputStream.seek(mStartOffset);
        mCurrentOffset = mStartOffset;
        mReader = new BufferedReader(new InputStreamReader(fileInputStream, "utf-8"));
        opened = true;
      } finally {
        // Do not leak the raw stream if seeking or wrapping it failed.
        if (!opened) {
          fileInputStream.close();
        }
      }

      // Initialize Key and Value.
      mCurrentKey = new LongWritable();
      mCurrentValue = new Text();
    }

    /**
     * Downcast an InputSplit to a FileSplit.
     *
     * @param split The InputSplit.
     * @return The FileSplit.
     * @throws IllegalArgumentException if the split is null or is not a FileSplit.
     */
    private FileSplit downcast(InputSplit split) {
      // The null guard keeps a null split from failing with a message-less NPE while the
      // error message is being built.
      Preconditions.checkArgument(split instanceof FileSplit,
          "Only %s is supported, but found %s",
          FileSplit.class, (split == null) ? null : split.getClass());
      return (FileSplit) split;
    }

    /**
     * Find the start of a record given the current field values.
     *
     * @return Whether the beginning of a record was found.
     * @throws IOException in case of an IO error.
     */
    private boolean findRecordStart() throws IOException {
      return findRecordStart(
          mRecordBeginChars,
          mStartOffset,
          mEndOffset,
          mReader,
          mCurrentKey,
          mRecordBuilder
      );
    }

    /**
     * Seeks into a split until it finds a given record delimiting start string.
     * Package private for testing purposes only, should not be called externally.
     *
     * @param recordBeginChars A char array of the record delimiting tag. This consists of the
     *     opening {@code <} character for the tag and the name of the entity itself; it does not
     *     include the trailing {@code >} character because a tag may contain attributes. For
     *     example, for a record consisting of {@code <user>contents</user>}, this will contain an
     *     array of characters representing {@code <user}.
     * @param startOffset Offset in the file of the beginning of the split.  Retained for
     *     interface compatibility; the key is derived from the reader's current offset, which
     *     is already relative to the beginning of the file.
     * @param endOffset Offset in the file of the end of the split.
     * @param reader BufferedReader on the input data.
     * @param currentKey A LongWritable to be set to the start of a found record.
     * @param recordBuilder A StringBuilder containing the partially formed record. The record
     *     start tag will be appended to the StringBuilder if this method finds the start of a
     *     record (i.e. returns true).
     * @return True if the beginning of a record has been found, false if it reaches the end of
     *     the split or the end of the file.
     * @throws IOException in case of an IO error.
     */
    boolean findRecordStart(
        char[] recordBeginChars,
        long startOffset,
        long endOffset,
        BufferedReader reader,
        LongWritable currentKey,
        StringBuilder recordBuilder
    ) throws IOException {
      final int recordBeginLength = recordBeginChars.length;

      // Index of the next unmatched character in recordBeginChars.
      int matchRecordBeginIndex = 0;

      // Seek until a record begin tag is found, or the split/file is exhausted.
      while (true) {
        // Stop at the end of the split only if a record has not been partially matched.
        // Protects against the case that a record begin tag falls across a split boundary.
        if (matchRecordBeginIndex == 0 && mCurrentOffset >= endOffset) {
          return false;
        }

        final int nextChar = reader.read();
        // Return false if we reach EOF without opening a record.
        if (nextChar == -1) {
          return false;
        }
        mCurrentOffset++;

        // A mismatch resets the matcher.
        if ((char) nextChar != recordBeginChars[matchRecordBeginIndex]) {
          matchRecordBeginIndex = 0;
          continue;
        }

        matchRecordBeginIndex++;
        if (matchRecordBeginIndex < recordBeginLength) {
          // Tag only partially matched so far.
          continue;
        }

        // The entire begin tag matched; it must be followed by '>' or whitespace.
        final int validatorRead = reader.read();
        // Return false if we reach EOF without opening a record.
        if (validatorRead == -1) {
          return false;
        }
        final char recordValidatorChar = (char) validatorRead;
        mCurrentOffset++;

        if (recordValidatorChar == '>' || Character.isWhitespace(recordValidatorChar)) {
          // mCurrentOffset is an absolute file offset (it is seeded with the split start), so
          // the record begins at the current location, minus the length of the begin tag, minus
          // one for the extra character following the begin tag.  The split start must not be
          // added again here.
          currentKey.set(mCurrentOffset - recordBeginLength - 1);
          // Add the record begin tag to the StringBuilder holding the partial record.
          recordBuilder.append(recordBeginChars).append(recordValidatorChar);
          return true;
        }

        // The begin tag is followed by an invalid char: reset the matcher.  For example, when
        // searching for records beginning with "<foo", "<food>" should not match because 'd' is
        // neither '>' nor whitespace.
        matchRecordBeginIndex = 0;
      }
    }

    /**
     * Find the end of a record using the current field values.
     *
     * @return True if the end of a record is found, false if the reader reaches the end of the
     *     file or exceeds the overrun allowance.
     * @throws IOException in case of an IO error.
     */
    private boolean findRecordEnd() throws IOException {
      return findRecordEnd(
          mRecordEndChars,
          mReader,
          mEndOffset,
          mOverrunAllowance,
          mRecordBuilder,
          mCurrentValue
      );
    }

    /**
     * Seeks into a split until it finds a given record delimiting end string.
     * Package private for testing purposes only, should not be called externally.
     *
     * @param recordEndChars A char array of the record delimiting tag. For example, a user record
     *     should end with {@code </user>}.
     * @param reader BufferedReader on the input data.
     * @param endOffset Offset of the end of the split.
     * @param overrunAllowance Number of bytes beyond the end of the split the reader may look for
     *     the end of an open record.
     * @param recordBuilder A StringBuilder containing the partially formed record. Each character
     *     read by the BufferedReader will be appended to the StringBuilder.
     * @param currentValue A Text to be set to the contents of an entire record.
     * @return True if the end of a record has been found, false if it reaches the end of the
     *     file or exceeds the overrun allowance.
     * @throws IOException in case of an IO error.
     */
    boolean findRecordEnd(
        char[] recordEndChars,
        BufferedReader reader,
        long endOffset,
        long overrunAllowance,
        StringBuilder recordBuilder,
        Text currentValue
    ) throws IOException {
      final int recordEndLength = recordEndChars.length;

      // Index of the next unmatched character in recordEndChars.
      int matchRecordEndIndex = 0;

      // Seek until you find a record end tag or exceed the split overrun allowance.
      while (mCurrentOffset <= endOffset + overrunAllowance) {
        // Read the next char and add it to the record output.
        final int nextChar = reader.read();
        // Return false if we reach EOF without closing the record.
        if (nextChar == -1) {
          return false;
        }
        final char currentChar = (char) nextChar;
        mCurrentOffset++;
        recordBuilder.append(currentChar);

        if (currentChar == recordEndChars[matchRecordEndIndex]) {
          matchRecordEndIndex++;
          // If we've matched the entire record end tag, set the currentValue and return.
          if (matchRecordEndIndex == recordEndLength) {
            currentValue.set(recordBuilder.toString());
            return true;
          }
        } else {
          // If the char is not next in the record end tag, reset the matcher.
          matchRecordEndIndex = 0;
        }
      }

      // If we exceed the overrunAllowance, return false.
      return false;
    }

    /** {@inheritDoc} */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      // Every record value starts with the configured XML header.
      mRecordBuilder = new StringBuilder().append(mHeader);
      return findRecordStart() && findRecordEnd();
    }

    /** {@inheritDoc} */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
      return mCurrentKey;
    }

    /** {@inheritDoc} */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
      return mCurrentValue;
    }

    /** {@inheritDoc} */
    @Override
    public float getProgress() throws IOException, InterruptedException {
      final long bytesTotal = mEndOffset - mStartOffset;
      // An empty split would otherwise divide by zero and report NaN.
      if (bytesTotal <= 0L) {
        return 0.0f;
      }
      final long bytesProcessed = Math.max(0L, mCurrentOffset - mStartOffset);
      // The reader may run past the end of the split while closing a record; cap at 100%.
      return Math.min(1.0f, (float) bytesProcessed / (float) bytesTotal);
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
      // Safe to call before initialize() and more than once.
      if (mReader != null) {
        try {
          mReader.close();
        } finally {
          mReader = null;
        }
      }
    }
  }
}