/**
* CloudGraph Community Edition (CE) License
*
* This is a community release of CloudGraph, a dual-license suite of
* Service Data Object (SDO) 2.1 services designed for relational and
* big-table style "cloud" databases, such as HBase and others.
 * This particular copy of the software is released under
 * version 2 of the GNU General Public License. CloudGraph was developed by
* TerraMeta Software, Inc.
*
* Copyright (c) 2013, TerraMeta Software, Inc. All rights reserved.
*
* General License information can be found below.
*
* This distribution may include materials developed by third
* parties. For license and attribution notices for these
* materials, please refer to the documentation that accompanies
* this distribution (see the "Licenses for Third-Party Components"
* appendix) or view the online documentation at
* <http://cloudgraph.org/licenses/>.
*/
package org.cloudgraph.mapreduce;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.lang.reflect.Method;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.StringUtils;
import org.cloudgraph.common.CloudGraphConstants;
import org.cloudgraph.store.service.MetricCollector;
import org.plasma.sdo.core.CoreDataObject;
import org.plasma.sdo.xml.DefaultOptions;
import org.plasma.sdo.xml.StreamUnmarshaller;
import org.plasma.sdo.xml.UnmarshallerException;
import commonj.sdo.DataGraph;
import commonj.sdo.helper.XMLDocument;
/**
 * An HDFS XML text file record reader that iterates over HDFS data for the current
 * <code>FileSplit</code>, unmarshalling the XML as structured data graphs based on
 * structural and XML-specific metadata from the underlying domain model. Data graphs
 * supplied through {@link GraphXmlInputFormat} may be heterogeneous and of any size
 * or complexity, including graphs where the underlying domain model contains
 * instances of multiple inheritance. The unmarshalling is stream oriented and
 * leverages the StAX-based Plasma
 * <a href="http://plasma-sdo.org/org/plasma/sdo/xml/StreamUnmarshaller.html">StreamUnmarshaller</a>.
 * <p>
 * Several job {@link Counters} are set up which accumulate metrics for each
 * resulting graph, such as the number of graph nodes assembled and the time taken
 * to unmarshal the XML.
 * </p>
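 * <p>
 * For example, a job driver might read the accumulated totals after the job
 * completes; the sketch below assumes only the standard Hadoop counter API and
 * the {@link Counters} constants referenced in this class:
 * </p>
 * <pre>
 * job.waitForCompletion(true);
 * long nodes = job.getCounters().findCounter(
 *     Counters.CLOUDGRAPH_COUNTER_GROUP_NAME,
 *     Counters.CLOUDGRAPH_COUNTER_NAME_NUM_GRAPH_NODES_ASSEMBLED).getValue();
 * long unmarshalMillis = job.getCounters().findCounter(
 *     Counters.CLOUDGRAPH_COUNTER_GROUP_NAME,
 *     Counters.CLOUDGRAPH_COUNTER_NAME_TOT_GRAPH_XML_UNMARSHAL_TIME).getValue();
 * </pre>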
*
 * This XML text file record reader is "line oriented": every line in the
 * underlying HDFS file is assumed to be a single data graph marshalled as XML.
 * Below is an example where a given data graph is serialized as a single line.
*
 * <pre>
 * protected byte[] marshal(DataGraph graph) throws IOException {
 *   DefaultOptions options = new DefaultOptions(graph.getRootObject().getType().getURI());
 *   options.setRootNamespacePrefix("c");
 *   options.setPrettyPrint(false);
 *   XMLDocument doc = PlasmaXMLHelper.INSTANCE.createDocument(graph.getRootObject(),
 *       graph.getRootObject().getType().getURI(), null);
 *   doc.setXMLDeclaration(false);
 *   ByteArrayOutputStream os = new ByteArrayOutputStream();
 *   PlasmaXMLHelper.INSTANCE.save(doc, os, options);
 *   os.close();
 *   return os.toByteArray();
 * }
 * </pre>
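 *
 * A job is then typically configured to read such files back through
 * {@link GraphXmlInputFormat}, which supplies this record reader; a minimal
 * sketch, where the namespace URI and input path are hypothetical:
 *
 * <pre>
 * Configuration conf = new Configuration();
 * conf.set(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI, "http://mydomain/mymodel");
 * conf.set(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "c");
 * Job job = new Job(conf, "graph-xml-read");
 * job.setInputFormatClass(GraphXmlInputFormat.class);
 * FileInputFormat.addInputPath(job, new Path("/data/graphs.xml"));
 * </pre>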
*
* @see org.cloudgraph.mapreduce.GraphWritable
* @see org.cloudgraph.mapreduce.GraphXmlInputFormat
*
* @author Scott Cinnamond
* @since 0.5.8
*/
public class GraphXmlRecordReader extends
RecordReader<LongWritable, GraphWritable> {
static final Log log = LogFactory.getLog(GraphXmlRecordReader.class);
  private long start; // byte offset at which this split begins
  private long pos; // current read position within the split
  private long end; // byte offset at which this split ends
  private LineReader in;
  private int maxLineLength; // maximum allowed bytes for a single record
  private LongWritable key = new LongWritable();
  private GraphWritable value = null;
  private Configuration configuration;
  private String rootNamespaceUri;
  private String rootNamespacePrefix;
  private DefaultOptions unmarshalOptions;
  private StreamUnmarshaller unmarshaler;
  private TaskAttemptContext context;
  private Method getCounter = null; // reflective getCounter(String, String) on the task context, if available
  private long totalGraphNodesAssembled = 0; // per-graph value captured by unmarshal(), used to increment job counters
  private long totalGraphUnmarshalTime = 0; // per-graph unmarshal time (ms), used to increment job counters
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
    // This InputSplit is a FileSplit
FileSplit split = (FileSplit) inputSplit;
this.context = context;
this.configuration = context.getConfiguration();
this.getCounter = Counters.retrieveGetCounterWithStringsParams(context);
this.rootNamespaceUri = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI);
this.rootNamespacePrefix = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "ns1");
this.unmarshalOptions = new DefaultOptions(this.rootNamespaceUri);
this.unmarshalOptions.setRootNamespacePrefix(this.rootNamespacePrefix);
this.unmarshalOptions.setValidate(false);
this.unmarshalOptions.setFailOnValidationError(false);
this.unmarshaler =
new StreamUnmarshaller(this.unmarshalOptions, null);
    // Retrieve the maximum allowed
    // bytes for a single record
this.maxLineLength = configuration.getInt(
"mapred.linerecordreader.maxlength",
Integer.MAX_VALUE);
// Split "S" is responsible for all records
// starting from "start" and "end" positions
start = split.getStart();
end = start + split.getLength();
// Retrieve file containing Split "S"
final Path file = split.getPath();
FileSystem fs = file.getFileSystem(this.configuration);
FSDataInputStream fileIn = fs.open(split.getPath());
// If Split "S" starts at byte 0, first line will be processed
// If Split "S" does not start at byte 0, first line has been already
// processed by "S-1" and therefore needs to be silently ignored
boolean skipFirstLine = false;
if (start != 0) {
skipFirstLine = true;
      // Set the file pointer at the "start - 1" position.
      // This is to make sure we won't miss any line,
      // which could happen if "start" is located on an EOL
--start;
fileIn.seek(start);
}
in = new LineReader(fileIn, this.configuration);
    // If the first line needs to be skipped, read it
    // and store its content in a dummy Text
if (skipFirstLine) {
Text dummy = new Text();
// Reset "start" to "start + line offset"
start += in.readLine(dummy, 0,
(int) Math.min(
(long) Integer.MAX_VALUE,
end - start));
}
// Position is the actual start
this.pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// Current offset is the key
key.set(pos);
int newSize = 0;
// Make sure we get at least one record that starts in this Split
    while (pos < end) {
      Text text = new Text();
      // Read the next line and store its content in "text"
      newSize = in.readLine(text, maxLineLength,
          Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
      // No bytes read; we have reached the end of the Split.
      // Break and return false (no key / value)
      if (newSize == 0) {
        break;
      }
      // Line is read, new position is set
      pos += newSize;
      // Line is shorter than the maximum record line size;
      // unmarshal it as a data graph, then break and
      // return true (found key / value)
      if (newSize < maxLineLength) {
        DataGraph graph = this.unmarshal(text);
        this.value = new GraphWritable(graph);
        updateCounters();
        break;
      }
      // Line is too long.
      // Try again with position = position + line offset,
      // i.e. ignore this line and go to the next one
      log.error("Skipped line of size " + newSize + " at pos "
          + (pos - newSize));
    }
if (newSize == 0) {
// We've reached end of Split
key = null;
value = null;
return false;
} else {
// Tell Hadoop a new line has been found
// key / value will be retrieved by
// getCurrentKey getCurrentValue methods
return true;
}
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public GraphWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float) (end - start));
}
}
@Override
public void close() throws IOException {
if (in != null) {
in.close();
}
}
/**
* Updates various job counters.
* @throws IOException
*/
private void updateCounters() throws IOException {
    // counters are accessible only where the (new API) task context
    // exposes a getCounter(String, String) method
if (this.getCounter == null) {
return;
}
try {
((Counter) this.getCounter.invoke(context,
Counters.CLOUDGRAPH_COUNTER_GROUP_NAME, Counters.CLOUDGRAPH_COUNTER_NAME_NUM_GRAPH_NODES_ASSEMBLED))
.increment(this.totalGraphNodesAssembled);
((Counter) this.getCounter.invoke(context,
Counters.CLOUDGRAPH_COUNTER_GROUP_NAME, Counters.CLOUDGRAPH_COUNTER_NAME_TOT_GRAPH_XML_UNMARSHAL_TIME))
.increment(this.totalGraphUnmarshalTime);
} catch (Exception e) {
log.debug("can't update counter."
+ StringUtils.stringifyException(e));
}
}
  /**
   * Deserializes the given XML text, unmarshalling it as a data graph, capturing
   * various metrics and returning the new graph. The given text represents a
   * single line in the underlying HDFS file and is assumed to be an XML
   * serialized data graph.
   *
   * @param text the input text
   * @return the new graph
   * @throws IOException
   */
private DataGraph unmarshal(Text text) throws IOException {
long before = System.currentTimeMillis();
String textString = text.toString();
ByteArrayInputStream xmlloadis = new ByteArrayInputStream(textString.getBytes("UTF-8"));
try {
this.unmarshaler.unmarshal(xmlloadis);
} catch (XMLStreamException e) {
throw new IOException(e);
} catch (UnmarshallerException e) {
throw new IOException(e);
}
    XMLDocument doc = this.unmarshaler.getResult();
    doc.setNoNamespaceSchemaLocation(null);
    long after = System.currentTimeMillis();
    // traverse the new graph, collecting node count and depth metrics
    CoreDataObject root = (CoreDataObject) doc.getRootObject();
    MetricCollector visitor = new MetricCollector();
    root.accept(visitor);
    // attach the metrics to the graph root as instance properties
    root.setValue(CloudGraphConstants.GRAPH_ASSEMBLY_TIME,
        Long.valueOf(after - before));
    root.setValue(CloudGraphConstants.GRAPH_NODE_COUNT,
        Long.valueOf(visitor.getCount()));
    root.setValue(CloudGraphConstants.GRAPH_DEPTH,
        Long.valueOf(visitor.getDepth()));
    // capture the per-graph values used to increment the job counters
    Long time = (Long) root.getValue(CloudGraphConstants.GRAPH_ASSEMBLY_TIME);
    this.totalGraphUnmarshalTime = time.longValue();
    Long nodeCount = (Long) root.getValue(CloudGraphConstants.GRAPH_NODE_COUNT);
    this.totalGraphNodesAssembled = nodeCount.longValue();
return doc.getRootObject().getDataGraph();
}
}