/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.piggybank.storage;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.tools.bzip2r.CBZip2InputStream;

/**
 * A <code>XMLLoaderBufferedPositionedInputStream</code> is a package-private
 * class and a decorator over BufferedPositionedInputStream, which in turn
 * decorates BufferedInputStream. It contains a
 * <code>BufferedPositionedInputStream</code> input stream, which it uses as
 * its basic source of data, possibly reading it directly or providing
 * additional functionality. The class
 * <code>XMLLoaderBufferedPositionedInputStream</code> itself simply overrides
 * the necessary methods, i.e. <code>read</code> and <code>getPosition</code>,
 * with versions that pass all requests to the contained input stream or do
 * some special processing. Subclasses of
 * <code>XMLLoaderBufferedPositionedInputStream</code> may further override
 * some of these methods and may also provide additional methods and fields.
 * It also provides an additional method, <code>collectTag</code>, which
 * returns the byte array between the tags, i.e. one XML record of the form
 * <tag> .* </tag>.
 *
 * @note we can't use a standard SAX or StAX parser because, for a big XML
 * file, an individual Hadoop block may not be valid XML on its own, and
 * hence those parsers may fail.
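 *
 * For illustration only (hypothetical input, not from the original
 * documentation): given the input <doc><a>x</a></doc>, collectTag("a", limit)
 * returns the bytes of <a>x</a>, while a self-closing match such as <a/> is
 * returned as-is.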
 *
 * @since pig 2.0
 */
class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStream {

    public final static int S_START = 0;
    public final static int S_MATCH_PREFIX = 1;
    public final static int S_MATCH_TAG = 2;

    /**
     * The input stream to be filtered.
     */
    InputStream wrapperIn;

    /**
     * Tracks whether the underlying buffer contains any more bytes.
     */
    boolean _isReadable;

    /**
     * The maximum number of bytes readable by this instance of the stream.
     */
    private long maxBytesReadable = 0;

    /**
     * The number of bytes read by this stream so far.
     */
    long bytesRead = 0;

    /**
     * Denotes the end location of the current split.
     */
    long end = 0;

    /**
     * Creates a <code>XMLLoaderBufferedPositionedInputStream</code>
     * by assigning the argument <code>in</code>
     * to the field <code>this.wrapperIn</code> so as
     * to remember it for later use.
     *
     * @param in the underlying input stream
     */
    public XMLLoaderBufferedPositionedInputStream(InputStream in) {
        super(in);
        this.wrapperIn = in;
        setReadable(true);
    }

    /**
     * Creates a split-aware <code>XMLLoaderBufferedPositionedInputStream</code>.
     *
     * @param in the underlying input stream
     * @param start start location of the split
     * @param end end location of the split
     */
    public XMLLoaderBufferedPositionedInputStream(InputStream in, long start, long end) {
        this(in);
        this.end = end;
        maxBytesReadable = end - start;
    }

    /**
     * Sets the stream readable or not readable. This is needed
     * to control the XML parsing.
     *
     * @param flag the boolean flag to be set
     * @see XMLLoaderBufferedPositionedInputStream#isReadable
     */
    private void setReadable(boolean flag) {
        _isReadable = flag;
    }

    /**
     * Checks whether the stream is readable. This is needed
     * to control the XML parsing.
     *
     * @return true if readable, otherwise false
     * @see XMLLoaderBufferedPositionedInputStream#setReadable
     */
    public boolean isReadable() {
        return _isReadable;
    }

    /**
     * Wrapper over org.apache.pig.impl.io.BufferedPositionedInputStream.read.
     * Reads the next byte of data from this input stream. The value
     * byte is returned as an <code>int</code> in the range
     * <code>0</code> to <code>255</code>. If no byte is available
     * because the end of the stream has been reached, the value
     * <code>-1</code> is returned. This method blocks until input data
     * is available, the end of the stream is detected, or an exception
     * is thrown.
     * <p>
     * This method simply performs <code>in.read()</code> and returns the
     * result.
     *
     * @return the next byte of data, or <code>-1</code> if the end of the
     *         stream is reached.
     * @exception IOException if an I/O error occurs.
     * @see XMLLoaderBufferedPositionedInputStream#wrapperIn
     */
    public int read() throws IOException {
        return wrapperIn.read();
    }

    /**
     * Collects the bytes from the current position up to and including the
     * matching end tag. This scans for the tags and does the pattern match
     * byte by byte. It must be used along with
     * {@link XMLLoaderBufferedPositionedInputStream#skipToTag}.
     *
     * @param tagName the end tag to search for
     * @param limit the end pointer of the block for this mapper
     * @return the byte array containing the document content until the end
     *         of the tag
     * @see XMLLoaderBufferedPositionedInputStream#skipToTag
     */
    private byte[] collectUntilEndTag(String tagName, long limit) {
        //@todo use the charset and get the charset encoding from the xml encoding.
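        // Illustration (hypothetical data): after skipToTag has consumed
        // "<property>", a remaining stream of "<name>x</name></property>"
        // makes this method return "<name>x</name></property>"; the nested
        // <name> element raises depth to 1, so only the outer closing tag
        // ends the scan.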
        byte[] tmp = tagName.getBytes();
        ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);

        // Levels of elements we went inside the matched node
        int depth = 0;
        // Since skipToTag was called before this function, we know that we are
        // currently inside the matched tag. Assuming the XML file is well
        // structured, we read till we encounter the first close tag. Since
        // the matched element might contain nested elements, we keep track of
        // the current depth and terminate only when we encounter a closing
        // tag at level zero.

        // A flag to indicate the parsing is currently inside a (start/end) tag
        boolean insideTag = false;
        // A flag to indicate that the current tag is a closing (end) tag
        boolean closingTag = false;
        // Last byte read
        int last_b = -1;
        while (true) {
            int b = -1;
            try {
                b = this.read();
                ++bytesRead; // Add one to the bytes read
                if (b == -1) {
                    collectBuf.reset();
                    this.setReadable(false);
                    break;
                }
                collectBuf.write((byte)(b));
                // Check if the start tag has matched except for the last char
                if (b == '<') {
                    insideTag = true;
                    closingTag = false;
                } else if (b == '>') {
                    // Detect the pattern />
                    if (last_b == '/')
                        closingTag = true;
                    insideTag = false;
                    if (closingTag) {
                        if (depth == 0)
                            break;
                        depth--;
                    }
                } else if (b == '/' && last_b == '<') {
                    // Detected the pattern </
                    closingTag = true;
                } else if (insideTag && last_b == '<') {
                    // First character after '<' which is not a '/'
                    depth++;
                }
            } catch (IOException e) {
                this.setReadable(false);
                return null;
            }
            last_b = b;
        }
        return collectBuf.toByteArray();
    }

    /**
     * Skips to the matching start tag and collects it. This scans for the tag
     * and does the pattern match byte by byte. It returns a partial document,
     * so it must be used along with
     * {@link XMLLoaderBufferedPositionedInputStream#collectUntilEndTag}.
     *
     * @param tagName the start tag to search for
     * @param limit the end pointer of the block for this mapper
     * @return the byte array containing the match of the tag
     * @see XMLLoaderBufferedPositionedInputStream#collectUntilEndTag
     */
    private byte[] skipToTag(String tagName, long limit) throws IOException {
        //@todo use the charset and get the charset encoding from the xml encoding.
        byte[] tmp = tagName.getBytes();
        byte[] tag = new byte[tmp.length + 1];
        tag[0] = (byte)'<';
        for (int i = 0; i < tmp.length; ++i) {
            tag[1 + i] = tmp[i];
        }
        ByteArrayOutputStream matchBuf = new ByteArrayOutputStream(512);
        int idxTagChar = 0;
        int state = S_START;
        /*
         * Read till the tag is found in this block. If a partial tag block is
         * found then continue on to the next block. matchBuf contains the
         * data that is currently matched. If the read has reached the end of
         * the split and there is matched data, then continue on to the next
         * block.
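         *
         * Note on the states (added for clarity): S_START matches the literal
         * prefix '<' + tagName; S_MATCH_PREFIX accepts the match only if the
         * next byte is whitespace, '/' or '>', so that a target of "property"
         * does not falsely match "<propertyList>"; S_MATCH_TAG then copies
         * bytes until the closing '>' of the start tag.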
         */
        while (splitBoundaryCriteria(wrapperIn) || (matchBuf.size() > 0)) {
            int b = -1;
            try {
                b = this.read();
                ++bytesRead; // Increment the bytes read by 1
                if (b == -1) {
                    state = S_START;
                    matchBuf.reset();
                    this.setReadable(false);
                    break;
                }
                switch (state) {
                case S_START:
                    // start to match the target open tag
                    if (b == tag[idxTagChar]) {
                        ++idxTagChar;
                        matchBuf.write((byte)(b));
                        if (idxTagChar == tag.length) {
                            state = S_MATCH_PREFIX;
                        }
                    } else { // mismatch
                        idxTagChar = 0;
                        matchBuf.reset();
                    }
                    break;
                case S_MATCH_PREFIX:
                    // tag matches iff the next character is whitespace or a close tag mark
                    if (Character.isWhitespace(b) || b == '/' || b == '>') {
                        matchBuf.write((byte)(b));
                        state = S_MATCH_TAG;
                    } else {
                        idxTagChar = 0;
                        matchBuf.reset();
                        state = S_START;
                    }
                    break;
                case S_MATCH_TAG:
                    // keep copying characters until we hit the close tag mark
                    matchBuf.write((byte)(b));
                    break;
                default:
                    throw new IllegalArgumentException("Invalid state: " + state);
                }
                if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) {
                    break;
                }
                if (state != S_MATCH_TAG && this.getPosition() > limit) {
                    // need to break, no record in this block
                    break;
                }
            } catch (IOException e) {
                this.setReadable(false);
                return null;
            }
        }
        return matchBuf.toByteArray();
    }

    /**
     * Returns whether the split boundary condition has been reached or not.
     * For normal files, the condition is to read until the split end is
     * reached. Gz files will have maxBytesReadable set to near
     * Long.MAX_VALUE, which causes the entire file to be read. For bz2 and bz
     * files, the condition is based on the position up to which the stream
     * has been read.
     *
     * @param wrapperIn2 the wrapped input stream
     * @return true/false depending on whether the split boundary has been
     *         reached or not
     * @throws IOException
     */
    private boolean splitBoundaryCriteria(InputStream wrapperIn2) throws IOException {
        if (wrapperIn2 instanceof CBZip2InputStream)
            return ((CBZip2InputStream)wrapperIn2).getPos() <= end;
        else
            return bytesRead <= maxBytesReadable;
    }

    /**
     * Collects the bytes from the start tag to the end tag, both inclusive.
     * This scans for the tags and does the pattern match byte by byte.
     *
     * @param tagName the start tag to search for
     * @param limit the end pointer of the block for this mapper
     * @return the byte array containing the match of <code><tag>.*</tag></code>
     * @see XMLLoaderBufferedPositionedInputStream#skipToTag
     * @see XMLLoaderBufferedPositionedInputStream#collectUntilEndTag
     */
    byte[] collectTag(String tagName, long limit) throws IOException {
        ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
        byte[] beginTag = skipToTag(tagName, limit);
        // skipToTag returns null on a read error; treat that as "no record"
        // instead of failing with a NullPointerException
        if (beginTag == null) {
            return collectBuf.toByteArray();
        }
        // Check if the tag is closed inline
        if (beginTag.length > 2 && beginTag[beginTag.length - 2] == '/'
                && beginTag[beginTag.length - 1] == '>') {
            return beginTag;
        }
        // No need to search for the end tag if the start tag is not found
        if (beginTag.length > 0) {
            byte[] untilTag = collectUntilEndTag(tagName, limit);
            if (untilTag != null && untilTag.length > 0) {
                for (byte b : beginTag) {
                    collectBuf.write(b);
                }
                for (byte b : untilTag) {
                    collectBuf.write(b);
                }
            }
        }
        return collectBuf.toByteArray();
    }
}

/**
 * The load function to load XML files.
 * This extends {@link LoadFunc}, which is used to parse records from a
 * dataset. Various helper adaptor functions are inherited from
 * loader.Utf8StorageConverter, which includes functions to cast raw byte data
 * into various data types, so other sections of the code can call back to the
 * loader to do the cast.
 * This takes an XML tag as its argument, which it uses to split the input
 * dataset into multiple records.
 * <code>
 *
 * For example, if the input xml (input.xml) is like this:
 *
 * <configuration>
 *   <property>
 *     <name> foobar </name>
 *     <value> barfoo </value>
 *   </property>
 *   <ignoreProperty>
 *     <name> foo </name>
 *   </ignoreProperty>
 *   <property>
 *     <name> justname </name>
 *   </property>
 * </configuration>
 *
 * And your pig script is like this:
 *
 * -- load the jar files
 * register /homes/aloks/pig/udfLib/loader.jar;
 * -- load the dataset using XMLLoader
 * -- A is the bag containing the tuple which contains one atom, i.e. doc; see output
 * A = load '/user/aloks/pig/input.xml' using loader.XMLLoader('property') as (doc:chararray);
 * -- dump the result
 * dump A;
 *
 * Then you will get the output:
 *
 * (<property>
 *   <name> foobar </name>
 *   <value> barfoo </value>
 * </property>)
 * (<property>
 *   <name> justname </name>
 * </property>)
 *
 * Where each () indicates one record.
 *
 * </code>
 */
public class XMLLoader extends LoadFunc {

    /**
     * logger from pig
     */
    protected final Log mLog = LogFactory.getLog(getClass());

    private XMLFileRecordReader reader = null;

    /**
     * the tuple content which is used while returning
     */
    private ArrayList<Object> mProtoTuple = null;

    /**
     * The record separator tag. The default value is 'document'.
     */
    public String recordIdentifier = "document";

    private String loadLocation;

    public XMLLoader() {
    }

    /**
     * Constructs a Pig loader that uses the specified string as the record
     * separator; for example, if the recordIdentifier is 'document', it will
     * consider <document> .* </document> as one record.
     *
     * @param recordIdentifier the xml tag which is used to pull records
     */
    public XMLLoader(String recordIdentifier) {
        this();
        this.recordIdentifier = recordIdentifier;
    }

    /**
     * Retrieves the next tuple to be processed.
     *
     * @return the next tuple to be processed, or null if there are no more
     *         tuples to be processed.
     * @throws IOException
     */
    @Override
    public Tuple getNext() throws IOException {
        boolean next = false;
        try {
            next = reader.nextKeyValue();
        } catch (InterruptedException e) {
            throw new IOException(e);
        }

        if (!next) return null;

        Tuple t = null;
        try {
            byte[] tagContent = (byte[]) reader.getCurrentValue();
            // No need to create the tuple if there are no contents
            t = (tagContent.length > 0) ? createTuple(tagContent) : null;
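            // Note (added for clarity): nextKeyValue() above only reports
            // whether the wrapped stream is still readable; the actual record
            // extraction happens in getCurrentValue(), which scans the stream
            // with collectTag(recordIdentifier, end).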
        } catch (Exception e) {
            throw new IOException(e);
        }
        return t;
    }

    public Tuple createTuple(byte[] content) throws Exception {
        if (mProtoTuple == null) {
            mProtoTuple = new ArrayList<Object>();
        }
        if (content.length > 0) {
            mProtoTuple.add(new DataByteArray(content));
        }
        Tuple t = TupleFactory.getInstance().newTupleNoCopy(mProtoTuple);
        mProtoTuple = null;
        return t;
    }

    /**
     * Checks for equality.
     *
     * @param obj the object to compare against
     */
    public boolean equals(Object obj) {
        // Guard the cast so that comparing against a non-XMLLoader does not
        // throw a ClassCastException
        return obj instanceof XMLLoader && equals((XMLLoader) obj);
    }

    /**
     * Checks for equality.
     *
     * @param other the XMLLoader to compare against
     */
    public boolean equals(XMLLoader other) {
        return this.recordIdentifier.equals(other.recordIdentifier);
    }

    @SuppressWarnings("unchecked")
    @Override
    public InputFormat getInputFormat() throws IOException {
        XMLFileInputFormat inputFormat = new XMLFileInputFormat(recordIdentifier);
        if (loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
            inputFormat.isSplitable = true;
        }
        return inputFormat;
    }

    @SuppressWarnings("unchecked")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        this.reader = (XMLFileRecordReader) reader;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        loadLocation = location;
        FileInputFormat.setInputPaths(job, location);
    }

    //------------------------------------------------------------------------
    // Implementation of InputFormat

    public static class XMLFileInputFormat extends FileInputFormat {

        /**
         * Boolean flag used to identify whether the splittable property is
         * explicitly set.
         */
        private boolean isSplitable = false;

        private String recordIdentifier;

        public XMLFileInputFormat(String recordIdentifier) {
            this.recordIdentifier = recordIdentifier;
        }

        @SuppressWarnings("unchecked")
        @Override
        public RecordReader createRecordReader(InputSplit split,
                TaskAttemptContext context) throws IOException, InterruptedException {
            return new XMLFileRecordReader(recordIdentifier);
        }

        @Override
        protected boolean isSplitable(JobContext context, Path filename) {
            CompressionCodec codec =
                new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
            // Plain (uncompressed) files are always splittable; compressed
            // files are splittable only if the flag was set explicitly
            return (codec == null) ? true : isSplitable;
        }
    }

    //------------------------------------------------------------------------
    // Implementation of RecordReader

    public static class XMLFileRecordReader extends RecordReader {

        private long start;
        private long end;
        private String recordIdentifier;

        /*
         * xmlloader input stream which has the ability to split the input
         * dataset into records by the specified tag
         */
        private XMLLoaderBufferedPositionedInputStream xmlLoaderBPIS = null;

        public XMLFileRecordReader(String recordIdentifier) {
            this.recordIdentifier = recordIdentifier;
        }

        @Override
        public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();

            // open the file and seek to the start of the split
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(split.getPath());
            // Seek to the start of the split
            fileIn.seek(start);

            if (file.toString().endsWith(".bz2") || file.toString().endsWith(".bz")) {
                // For bzip2 files use CBZip2InputStream to read and supply the upper input stream.
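                // Note (added for clarity): CBZip2InputStream exposes getPos(),
                // which splitBoundaryCriteria uses to stop reading once the
                // position passes the split end, so bz2/bz inputs can be split.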
                CBZip2InputStream in = new CBZip2InputStream(fileIn, 9, end);
                this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(in, start, end);
            } else if (file.toString().endsWith(".gz")) {
                CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
                final CompressionCodec codec = compressionCodecs.getCodec(file);
                if (codec != null) {
                    end = Long.MAX_VALUE;
                    CompressionInputStream stream = codec.createInputStream(fileIn);
                    this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(stream, start, end);
                }
            } else {
                this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(fileIn, start, end);
            }
        }

        @Override
        public void close() throws IOException {
            xmlLoaderBPIS.close();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return null;
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return xmlLoaderBPIS.collectTag(recordIdentifier, end);
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return 0;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return xmlLoaderBPIS.isReadable();
        }
    }
}