/**
* CloudGraph Community Edition (CE) License
*
* This is a community release of CloudGraph, a dual-license suite of
* Service Data Object (SDO) 2.1 services designed for relational and
* big-table style "cloud" databases, such as HBase and others.
 * This particular copy of the software is released under
 * version 2 of the GNU General Public License. CloudGraph was developed by
* TerraMeta Software, Inc.
*
* Copyright (c) 2013, TerraMeta Software, Inc. All rights reserved.
*
* General License information can be found below.
*
* This distribution may include materials developed by third
* parties. For license and attribution notices for these
* materials, please refer to the documentation that accompanies
* this distribution (see the "Licenses for Third-Party Components"
* appendix) or view the online documentation at
* <http://cloudgraph.org/licenses/>.
*/
package org.cloudgraph.mapreduce;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.lang.reflect.Method;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.StringUtils;
import org.cloudgraph.common.CloudGraphConstants;
import org.cloudgraph.store.service.MetricCollector;
import org.plasma.sdo.core.CoreDataObject;
import org.plasma.sdo.xml.DefaultOptions;
import org.plasma.sdo.xml.StreamUnmarshaller;
import org.plasma.sdo.xml.UnmarshallerException;
import commonj.sdo.DataGraph;
import commonj.sdo.helper.XMLDocument;
/**
 * An HDFS XML text file record reader that iterates over HDFS data for the current
 * <code>FileSplit</code>, unmarshalling the XML as structured data graphs based on
 * structural and XML-specific metadata from the underlying domain model. Data graphs
 * supplied through {@link GraphXmlInputFormat} may be heterogeneous and of any size
 * or complexity, including graphs where the underlying domain model contains
 * instances of multiple inheritance. The unmarshalling is stream oriented and
 * leverages the StAX-based Plasma
 * <a href="http://plasma-sdo.org/org/plasma/sdo/xml/StreamUnmarshaller.html">StreamUnmarshaller</a>.
 * <p>
 * Several job {@link Counters} are set up which accumulate metrics for each
 * resulting graph, such as the number of graph nodes assembled and the time taken
 * to unmarshal the XML.
 * </p>
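 * <p>
 * For example, a job driver might read the accumulated totals after the job
 * completes; the sketch below assumes only the standard Hadoop counter API and
 * the {@link Counters} constants referenced in this class:
 * </p>
 * <pre>
 * job.waitForCompletion(true);
 * long nodes = job.getCounters().findCounter(
 *     Counters.CLOUDGRAPH_COUNTER_GROUP_NAME,
 *     Counters.CLOUDGRAPH_COUNTER_NAME_NUM_GRAPH_NODES_ASSEMBLED).getValue();
 * long unmarshalMillis = job.getCounters().findCounter(
 *     Counters.CLOUDGRAPH_COUNTER_GROUP_NAME,
 *     Counters.CLOUDGRAPH_COUNTER_NAME_TOT_GRAPH_XML_UNMARSHAL_TIME).getValue();
 * </pre>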
*
 * This XML text file record reader is "line oriented": every line in the
 * underlying HDFS file is assumed to be a single data graph marshalled as XML.
 * Below is an example where a given data graph is serialized as a single line.
*
 * <pre>
 * protected byte[] marshal(DataGraph graph) throws IOException {
 *   DefaultOptions options = new DefaultOptions(graph.getRootObject().getType().getURI());
 *   options.setRootNamespacePrefix("c");
 *   options.setPrettyPrint(false);
 *   XMLDocument doc = PlasmaXMLHelper.INSTANCE.createDocument(graph.getRootObject(),
 *       graph.getRootObject().getType().getURI(), null);
 *   doc.setXMLDeclaration(false);
 *   ByteArrayOutputStream os = new ByteArrayOutputStream();
 *   PlasmaXMLHelper.INSTANCE.save(doc, os, options);
 *   os.close();
 *   return os.toByteArray();
 * }
 * </pre>
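 *
 * A job is then typically configured to read such files back through
 * {@link GraphXmlInputFormat}, which supplies this record reader; a minimal
 * sketch, where the namespace URI and input path are hypothetical:
 *
 * <pre>
 * Configuration conf = new Configuration();
 * conf.set(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI, "http://mydomain/mymodel");
 * conf.set(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "c");
 * Job job = new Job(conf, "graph-xml-read");
 * job.setInputFormatClass(GraphXmlInputFormat.class);
 * FileInputFormat.addInputPath(job, new Path("/data/graphs.xml"));
 * </pre>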
*
* @see org.cloudgraph.mapreduce.GraphWritable
* @see org.cloudgraph.mapreduce.GraphXmlInputFormat
*
* @author Scott Cinnamond
* @since 0.5.8
*/
public class GraphXmlRecordReader extends
RecordReader<LongWritable, GraphWritable> {
static final Log log = LogFactory.getLog(GraphXmlRecordReader.class);
  private long start; // byte offset at which this split begins
  private long pos; // current read position within the split
  private long end; // byte offset at which this split ends
  private LineReader in;
  private int maxLineLength; // maximum allowed bytes for a single record
  private LongWritable key = new LongWritable();
  private GraphWritable value = null;
  private Configuration configuration;
  private String rootNamespaceUri;
  private String rootNamespacePrefix;
  private DefaultOptions unmarshalOptions;
  private StreamUnmarshaller unmarshaler;
  private TaskAttemptContext context;
  private Method getCounter = null; // reflective getCounter(String, String) on the task context, if available
  private long totalGraphNodesAssembled = 0; // per-graph value captured by unmarshal(), used to increment job counters
  private long totalGraphUnmarshalTime = 0; // per-graph unmarshal time (ms), used to increment job counters
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
    // This InputSplit is a FileSplit
FileSplit split = (FileSplit) inputSplit;
this.context = context;
this.configuration = context.getConfiguration();
this.getCounter = Counters.retrieveGetCounterWithStringsParams(context);
this.rootNamespaceUri = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI);
this.rootNamespacePrefix = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "ns1");
this.unmarshalOptions = new DefaultOptions(this.rootNamespaceUri);
this.unmarshalOptions.setRootNamespacePrefix(this.rootNamespacePrefix);
this.unmarshalOptions.setValidate(false);
this.unmarshalOptions.setFailOnValidationError(false);
this.unmarshaler =
new StreamUnmarshaller(this.unmarshalOptions, null);
    // Retrieve the maximum allowed
    // bytes for a single record
this.maxLineLength = configuration.getInt(
"mapred.linerecordreader.maxlength",
Integer.MAX_VALUE);
// Split "S" is responsible for all records
// starting from "start" and "end" positions
start = split.getStart();
end = start + split.getLength();
// Retrieve file containing Split "S"
final Path file = split.getPath();
FileSystem fs = file.getFileSystem(this.configuration);
FSDataInputStream fileIn = fs.open(split.getPath());
// If Split "S" starts at byte 0, first line will be processed
// If Split "S" does not start at byte 0, first line has been already
// processed by "S-1" and therefore needs to be silently ignored
boolean skipFirstLine = false;
if (start != 0) {
skipFirstLine = true;
      // Set the file pointer at the "start - 1" position.
      // This is to make sure we won't miss any line,
      // which could happen if "start" is located on an EOL
--start;
fileIn.seek(start);
}
in = new LineReader(fileIn, this.configuration);
    // If the first line needs to be skipped, read it
    // and store its content in a dummy Text
if (skipFirstLine) {
Text dummy = new Text();
// Reset "start" to "start + line offset"
start += in.readLine(dummy, 0,
(int) Math.min(
(long) Integer.MAX_VALUE,
end - start));
}
// Position is the actual start
this.pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// Current offset is the key
key.set(pos);
int newSize = 0;
// Make sure we get at least one record that starts in this Split
    while (pos < end) {
      Text text = new Text();
      // Read the next line and store its content in "text"
      newSize = in.readLine(text, maxLineLength,
          Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
      // No bytes read; we have reached the end of the Split.
      // Break and return false (no key / value)
      if (newSize == 0) {
        break;
      }
      // Line is read, new position is set
      pos += newSize;
      // Line is shorter than the maximum record line size;
      // unmarshal it as a data graph, then break and
      // return true (found key / value)
      if (newSize < maxLineLength) {
        DataGraph graph = this.unmarshal(text);
        this.value = new GraphWritable(graph);
        updateCounters();
        break;
      }
      // Line is too long.
      // Try again with position = position + line offset,
      // i.e. ignore this line and go to the next one
      log.error("Skipped line of size " + newSize + " at pos "
          + (pos - newSize));
    }
if (newSize == 0) {
// We've reached end of Split
key = null;
value = null;
return false;
} else {
// Tell Hadoop a new line has been found
// key / value will be retrieved by
// getCurrentKey getCurrentValue methods
return true;
}
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public GraphWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float) (end - start));
}
}
@Override
public void close() throws IOException {
if (in != null) {
in.close();
}
}
/**
* Updates various job counters.
* @throws IOException
*/
private void updateCounters() throws IOException {
    // counters are accessible only where the (new API) task context
    // exposes a getCounter(String, String) method
if (this.getCounter == null) {
return;
}
try {
((Counter) this.getCounter.invoke(context,
Counters.CLOUDGRAPH_COUNTER_GROUP_NAME, Counters.CLOUDGRAPH_COUNTER_NAME_NUM_GRAPH_NODES_ASSEMBLED))
.increment(this.totalGraphNodesAssembled);
((Counter) this.getCounter.invoke(context,
Counters.CLOUDGRAPH_COUNTER_GROUP_NAME, Counters.CLOUDGRAPH_COUNTER_NAME_TOT_GRAPH_XML_UNMARSHAL_TIME))
.increment(this.totalGraphUnmarshalTime);
} catch (Exception e) {
log.debug("can't update counter."
+ StringUtils.stringifyException(e));
}
}
  /**
   * Deserializes the given XML text, unmarshalling it as a data graph, capturing
   * various metrics and returning the new graph. The given text represents a
   * single line in the underlying HDFS file and is assumed to be an XML
   * serialized data graph.
   *
   * @param text the input text
   * @return the new graph
   * @throws IOException
   */
private DataGraph unmarshal(Text text) throws IOException {
long before = System.currentTimeMillis();
String textString = text.toString();
ByteArrayInputStream xmlloadis = new ByteArrayInputStream(textString.getBytes("UTF-8"));
try {
this.unmarshaler.unmarshal(xmlloadis);
} catch (XMLStreamException e) {
throw new IOException(e);
} catch (UnmarshallerException e) {
throw new IOException(e);
}
    XMLDocument doc = this.unmarshaler.getResult();
    doc.setNoNamespaceSchemaLocation(null);
    long after = System.currentTimeMillis();
    // traverse the new graph, collecting node count and depth metrics
    CoreDataObject root = (CoreDataObject) doc.getRootObject();
    MetricCollector visitor = new MetricCollector();
    root.accept(visitor);
    // attach the metrics to the graph root as instance properties
    root.setValue(CloudGraphConstants.GRAPH_ASSEMBLY_TIME,
        Long.valueOf(after - before));
    root.setValue(CloudGraphConstants.GRAPH_NODE_COUNT,
        Long.valueOf(visitor.getCount()));
    root.setValue(CloudGraphConstants.GRAPH_DEPTH,
        Long.valueOf(visitor.getDepth()));
    // capture the per-graph values used to increment the job counters
    Long time = (Long) root.getValue(CloudGraphConstants.GRAPH_ASSEMBLY_TIME);
    this.totalGraphUnmarshalTime = time.longValue();
    Long nodeCount = (Long) root.getValue(CloudGraphConstants.GRAPH_NODE_COUNT);
    this.totalGraphNodesAssembled = nodeCount.longValue();
return doc.getRootObject().getDataGraph();
}
}