/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.piggybank.storage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.tools.bzip2r.CBZip2InputStream;
/**
* A <code>XMLLoaderBufferedPositionedInputStream</code> is the package class and is the
* decorator over the BufferedPositionedInputStream which in turn decorate
* BufferedInputStream. It contains <code>BufferedPositionedInputStream<code>
* input stream, which it uses as
* its basic source of data, possibly reading or providing additional
* functionality. The class <code>XMLLoaderBufferedPositionedInputStream</code>
* itself simply overrides the necessary medthod for reading i.e
* <code>read</code> <code>getPosition<code> with versions that
* pass all requests to the contained input
* stream or do some special processing. Subclasses of <code>XMLLoaderBufferedPositionedInputStream</code>
* may further override some of these methods
* and may also provide additional methods
* and fields.
* It also provides additional method <code>collectTag<collect> which will give the byte
* array between the tag which is a xml record. i.e <tag> .*</tag> will be returned
*
* @note we can't use the standard SAX or STAX parser as for a big xml
* the intermittent hadoop block may not be the valid xml and hence those
* parser may create pb.
*
* @since pig 2.0
*/
class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStream {
public final static int S_START = 0;
public final static int S_MATCH_PREFIX = 1;
public final static int S_MATCH_TAG = 2;
/**
* The input streamed to be filtered
*/
InputStream wrapperIn;
/**
* The field to know if the underlying buffer contains any more bytes
*/
boolean _isReadable;
/**
* The field set the maximum bytes that is readable by this instance of stream.
*/
private long maxBytesReadable = 0;
/**
* The field denote the number of bytes read by this stream.
*/
long bytesRead = 0;
/**
* Denotes the end of the current split location
*/
long end = 0;
/**
* Creates a <code>XMLLoaderBufferedPositionedInputStream</code>
* by assigning the argument <code>in</code>
* to the field <code>this.wrapperIn</code> so as
* to remember it for later use.
*
* @param in the underlying input stream,
*/
public XMLLoaderBufferedPositionedInputStream(InputStream in){
super(in);
this.wrapperIn = in;
setReadable(true);
}
/**
* Creates a split aware <code>XMLLoaderBufferedPositionedInputStream</code>.
* @param in the underlying input stream
* @param start start location of the split
* @param end end location of the split
*/
public XMLLoaderBufferedPositionedInputStream(InputStream in,long start,long end){
this(in);
this.end = end;
maxBytesReadable = end - start;
}
/**
* Set the stream readable or non readable. This is needed
* to control the xml parsing.
* @param flag The boolean flag to be set
* @see XMLLoaderBufferedPositionedInputStream#isReadable
*/
private void setReadable(boolean flag) {
_isReadable = flag;
}
/**
* See if the stream readable or non readable. This is needed
* to control the xml parsing.
* @return true if readable otherwise false
* @see XMLLoaderBufferedPositionedInputStream#setReadable
*/
public boolean isReadable() {
return _isReadable == true;
}
/**
* org.apache.pig.impl.io.BufferedPositionedInputStream.read
* It is just the wrapper for now.
* Reads the next byte of data from this input stream. The value
* byte is returned as an <code>int</code> in the range
* <code>0</code> to <code>255</code>. If no byte is available
* because the end of the stream has been reached, the value
* <code>-1</code> is returned. This method blocks until input data
* is available, the end of the stream is detected, or an exception
* is thrown.
* <p>
* This method
* simply performs <code>in.read()</code> and returns the result.
*
* @return the next byte of data, or <code>-1</code> if the end of the
* stream is reached.
* @exception IOException if an I/O error occurs.
* @see XMLLoaderBufferedPositionedInputStreamInputStream#wrapperIn
*/
public int read() throws IOException {
return wrapperIn.read();
}
/**
* This is collect the bytes from current position to the ending tag.
* This scans for the tags and do the pattern match byte by byte
* this must be used along with
* XMLLoaderBufferedPositionedInputStream#skipToTag
*
* @param tagName the end tag to search for
*
* @param limit the end pointer for the block for this mapper
*
* @return the byte array containing the documents until the end of tag
*
* @see loader.XMLLoaderBufferedPositionedInputStream.collectUntilEndTag
*
*/
private byte[] collectUntilEndTag(String tagName, long limit) {
//@todo use the charset and get the charset encoding from the xml encoding.
byte[] tmp = tagName.getBytes();
ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
// Levels of elements we went inside matched node
int depth = 0;
//Since skipToTag was called before this function, we know that we are
//currently inside the matched tag. Assuming the XML file is well
//structured, we read till we encounter the first close tag. Since
//the matched element might contain nested element, we keep track of the
//current depth and terminate only when we encounter a closing tag at
//level zero
// A flag to indicate the parsing is currently inside a (start/end) tag
boolean insideTag = false;
// A flag to indicate that the current tag is a closing (end) tag
boolean closingTag = false;
// Last byte read
int last_b = -1;
while (true) {
int b = -1;
try {
b = this.read();
++bytesRead; // Add one to the bytes read
if (b == -1) {
collectBuf.reset();
this.setReadable(false);
break;
}
collectBuf.write((byte)(b));
// Check if the start tag has matched except for the last char
if (b == '<') {
insideTag = true;
closingTag = false;
} else if (b == '>') {
// Detect the pattern />
if (last_b == '/')
closingTag = true;
insideTag = false;
if (closingTag) {
if (depth == 0)
break;
depth--;
}
} else if (b == '/' && last_b == '<') {
// Detected the pattern </
closingTag = true;
} else if (insideTag && last_b == '<') {
// First character after '<' which is not a '/'
depth++;
}
}
catch (IOException e) {
this.setReadable(false);
return null;
}
last_b = b;
}
return collectBuf.toByteArray();
}
/**
* This is collect the from the matching tag.
* This scans for the tags and do the pattern match byte by byte
* This returns a part doc. it must be used along with
* XMLLoaderBufferedPositionedInputStream#collectUntilEndTag
*
* @param tagName the start tag to search for
*
* @param limit the end pointer for the block for this mapper
*
* @return the byte array containing match of the tag.
*
* @see loader.XMLLoaderBufferedPositionedInputStream.collectUntilEndTag
*
*/
private byte[] skipToTag(String tagName, long limit) throws IOException {
//@todo use the charset and get the charset encoding from the xml encoding.
byte[] tmp = tagName.getBytes();
byte[] tag = new byte[tmp.length + 1];
tag[0] = (byte)'<';
for (int i = 0; i < tmp.length; ++i) {
tag[1+i] = tmp[i];
}
ByteArrayOutputStream matchBuf = new ByteArrayOutputStream(512);
int idxTagChar = 0;
int state = S_START;
/*
* Read till the tag is found in this block. If a partial tag block is found
* then continue on to the next block.matchBuf contains the data that is currently
* matched. If the read has reached the end of split and there are matched data
* then continue on to the next block.
*/
while (splitBoundaryCriteria(wrapperIn) || (matchBuf.size() > 0 )) {
int b = -1;
try {
b = this.read();
++bytesRead; // Increment the bytes read by 1
if (b == -1) {
state = S_START;
matchBuf.reset();
this.setReadable(false);
break;
}
switch (state) {
case S_START:
// start to match the target open tag
if (b == tag[idxTagChar]) {
++idxTagChar;
matchBuf.write((byte)(b));
if (idxTagChar == tag.length) {
state = S_MATCH_PREFIX;
}
} else { // mismatch
idxTagChar = 0;
matchBuf.reset();
}
break;
case S_MATCH_PREFIX:
// tag match iff next character is whitespaces or close tag mark
if (Character.isWhitespace(b) || b == '/' || b == '>') {
matchBuf.write((byte)(b));
state = S_MATCH_TAG;
} else {
idxTagChar = 0;
matchBuf.reset();
state = S_START;
}
break;
case S_MATCH_TAG:
// keep copy characters until we hit the close tag mark
matchBuf.write((byte)(b));
break;
default:
throw new IllegalArgumentException("Invalid state: " + state);
}
if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) {
break;
}
if (state != S_MATCH_TAG && this.getPosition() > limit) {
// need to break, no record in this block
break;
}
}
catch (IOException e) {
this.setReadable(false);
return null;
}
}
return matchBuf.toByteArray();
}
/**
* Returns whether the split boundary condition has reached or not.
* For normal files ; the condition is to read till the split end reaches.
* Gz files will have maxBytesReadable set to near Long.MAXVALUE, hence
* this will cause the entire file to be read. For bz2 and bz files, the
* condition lies on the position which until which it is read.
*
* @param wrapperIn2
* @return true/false depending on whether split boundary has reached or no
* @throws IOException
*/
private boolean splitBoundaryCriteria(InputStream wrapperIn2) throws IOException {
if(wrapperIn2 instanceof CBZip2InputStream)
return ((CBZip2InputStream)wrapperIn2).getPos() <= end;
else
return bytesRead <= maxBytesReadable;
}
/**
* This is collect bytes from start and end tag both inclusive
* This scans for the tags and do the pattern match byte by byte
*
* @param tagName the start tag to search for
*
* @param limit the end pointer for the block for this mapper
*
* @return the byte array containing match of the <code><tag>.*</tag><code>.
*
* @see loader.XMLLoaderBufferedPositionedInputStream.skipToTag
*
* @see loader.XMLLoaderBufferedPositionedInputStream.collectUntilEndTag
*
*/
byte[] collectTag(String tagName, long limit) throws IOException {
ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
byte[] beginTag = skipToTag(tagName, limit);
// Check if the tag is closed inline
if (beginTag.length > 2 && beginTag[beginTag.length - 2] == '/' &&
beginTag[beginTag.length-1] == '>') {
return beginTag;
}
// No need to search for the end tag if the start tag is not found
if(beginTag.length > 0 ){
byte[] untilTag = collectUntilEndTag(tagName, limit);
if (untilTag.length > 0) {
for (byte b: beginTag) {
collectBuf.write(b);
}
for (byte b: untilTag) {
collectBuf.write(b);
}
}
}
return collectBuf.toByteArray();
}
}
/**
* The load function to load the XML file
* This implements the LoadFunc interface which is used to parse records
* from a dataset. The various helper adaptor function is extended from loader.Utf8StorageConverter
* which included various functions to cast raw byte data into various datatypes.
* other sections of the code can call back to the loader to do the cast.
* This takes a xmlTag as the arg which it will use to split the inputdataset into
* multiple records.
* <code>
*
* For example if the input xml (input.xml) is like this
* <configuration>
* <property>
* <name> foobar </name>
* <value> barfoo </value>
* </property>
* <ignoreProperty>
* <name> foo </name>
* </ignoreProperty>
* <property>
* <name> justname </name>
* </property>
* </configuration>
*
* And your pig script is like this
*
* --load the jar files
* register /homes/aloks/pig/udfLib/loader.jar;
* -- load the dataset using XMLLoader
* -- A is the bag containing the tuple which contains one atom i.e doc see output
* A = load '/user/aloks/pig/input.xml using loader.XMLLoader('property') as (doc:chararray);
* --dump the result
* dump A;
*
*
* Then you will get the output
*
* (<property>
* <name> foobar </name>
* <value> barfoo </value>
* </property>)
* (<property>
* <name> justname </name>
* </property>)
*
*
* Where each () indicate one record
*
*
* </code>
*/
public class XMLLoader extends LoadFunc {
/**
* logger from pig
*/
protected final Log mLog = LogFactory.getLog(getClass());
private XMLFileRecordReader reader = null;
/**
* the tuple content which is used while returning
*/
private ArrayList<Object> mProtoTuple = null;
/**
* The record seperated. The default value is 'document'
*/
public String recordIdentifier = "document";
private String loadLocation;
public XMLLoader() {
}
/**
* Constructs a Pig loader that uses specified string as the record seperater
* for example if the recordIdentifier is document. It will consider the record as
* <document> .* </document>
*
* @param recordIdentifier the xml tag which is used to pull records
*
*/
public XMLLoader(String recordIdentifier) {
this();
this.recordIdentifier = recordIdentifier;
}
/**
* Retrieves the next tuple to be processed.
* @return the next tuple to be processed or null if there are no more tuples
* to be processed.
* @throws IOException
*/
@Override
public Tuple getNext() throws IOException {
boolean next = false;
try {
next = reader.nextKeyValue();
} catch (InterruptedException e) {
throw new IOException(e);
}
if (!next) return null;
Tuple t = null;
try {
byte[] tagContent = (byte[]) reader.getCurrentValue();
// No need to create the tuple if there are no contents
t = (tagContent.length > 0) ? createTuple(tagContent) : null;
} catch (Exception e) {
throw new IOException(e);
}
return t;
}
public Tuple createTuple(byte[] content) throws Exception {
if (mProtoTuple == null) {
mProtoTuple = new ArrayList<Object>();
}
if (content.length > 0) {
mProtoTuple.add(new DataByteArray(content));
}
Tuple t = TupleFactory.getInstance().newTupleNoCopy(mProtoTuple);
mProtoTuple = null;
return t;
}
/**
* to check for equality
* @param object
*/
public boolean equals(Object obj) {
return equals((XMLLoader)obj);
}
/**
* to check for equality
* @param XMLLoader object
*/
public boolean equals(XMLLoader other) {
return this.recordIdentifier.equals(other.recordIdentifier);
}
@SuppressWarnings("unchecked")
@Override
public InputFormat getInputFormat() throws IOException {
XMLFileInputFormat inputFormat = new XMLFileInputFormat(recordIdentifier);
if(loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
inputFormat.isSplitable = true;
}
return inputFormat;
}
@SuppressWarnings("unchecked")
@Override
public void prepareToRead(RecordReader reader, PigSplit split)
throws IOException {
this.reader = (XMLFileRecordReader) reader;
}
@Override
public void setLocation(String location, Job job) throws IOException {
loadLocation = location;
FileInputFormat.setInputPaths(job, location);
}
//------------------------------------------------------------------------
// Implementation of InputFormat
public static class XMLFileInputFormat extends FileInputFormat {
/**
* Boolean flag used to identify whether splittable property is explicitly set.
*/
private boolean isSplitable = false;
private String recordIdentifier;
public XMLFileInputFormat(String recordIdentifier) {
this.recordIdentifier = recordIdentifier;
}
@SuppressWarnings("unchecked")
@Override
public RecordReader createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException,
InterruptedException {
return new XMLFileRecordReader(recordIdentifier);
}
@Override
protected boolean isSplitable(JobContext context, Path filename) {
CompressionCodec codec =
new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
return (!(codec == null)) ? isSplitable : true;
}
}
//------------------------------------------------------------------------
// Implementation of RecordReader
public static class XMLFileRecordReader extends RecordReader {
private long start;
private long end;
private String recordIdentifier;
/*
* xmlloader input stream which has the ability to split the input
* dataset into records by the specified tag
*/
private XMLLoaderBufferedPositionedInputStream xmlLoaderBPIS = null;
public XMLFileRecordReader(String recordIdentifier) {
this.recordIdentifier = recordIdentifier;
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
// open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
// Seek to the start of the file
fileIn.seek(start);
if(file.toString().endsWith(".bz2") || file.toString().endsWith(".bz"))
{
// For bzip2 files use CBZip2InputStream to read and supply the upper input stream.
CBZip2InputStream in = new CBZip2InputStream(fileIn,9, end);
this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(in,start,end);
}
else if (file.toString().endsWith(".gz"))
{
CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
final CompressionCodec codec = compressionCodecs.getCodec(file);
if (codec != null) {
end = Long.MAX_VALUE;
CompressionInputStream stream = codec.createInputStream(fileIn);
this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(stream,start,end);
}
}
else
{
this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(fileIn,start,end);
}
}
@Override
public void close() throws IOException {
xmlLoaderBPIS.close();
}
@Override
public Object getCurrentKey() throws IOException, InterruptedException {
return null;
}
@Override
public Object getCurrentValue() throws IOException,
InterruptedException {
return xmlLoaderBPIS.collectTag(recordIdentifier, end);
}
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
return xmlLoaderBPIS.isReadable();
}
}
}