package com.thinkaurelius.faunus.formats; import com.thinkaurelius.faunus.FaunusEdge; import com.thinkaurelius.faunus.FaunusGraph; import com.thinkaurelius.faunus.FaunusVertex; import com.thinkaurelius.faunus.Holder; import com.thinkaurelius.faunus.Tokens; import com.thinkaurelius.faunus.formats.titan.GraphFactory; import com.thinkaurelius.faunus.formats.titan.TitanOutputFormat; import com.thinkaurelius.faunus.mapreduce.util.EmptyConfiguration; import com.tinkerpop.blueprints.Edge; import com.tinkerpop.blueprints.Graph; import com.tinkerpop.blueprints.TransactionalGraph; import com.tinkerpop.blueprints.Vertex; import com.tinkerpop.gremlin.groovy.jsr223.GremlinGroovyScriptEngine; import groovy.lang.MissingMethodException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.Reducer; import org.apache.log4j.Level; import org.apache.log4j.Logger; import javax.script.Bindings; import javax.script.ScriptException; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; import static com.tinkerpop.blueprints.Direction.IN; import static com.tinkerpop.blueprints.Direction.OUT; /** * BlueprintsGraphOutputMapReduce will write a [NullWritable, FaunusVertex] stream to a Blueprints-enabled graph. * This is useful for bulk loading a Faunus graph into a Blueprints graph. * Graph writing happens in three distinction phase. * During the first Map phase, all the vertices of the graph are written. * During the first Reduce phase, an id-to-id distributed map of the adjacency pairs is serialized. * During the second Map phase, all the edges of the graph are written. * Each write stage is embarrassingly parallel with reduce communication only used to communicate generated vertex ids. * The output of the final Map phase is a degenerate graph and is not considered viable for consumption. * * @author Marko A. Rodriguez (http://markorodriguez.com) */ public class BlueprintsGraphOutputMapReduce { public enum Counters { VERTICES_RETRIEVED, VERTICES_WRITTEN, VERTEX_PROPERTIES_WRITTEN, EDGES_WRITTEN, EDGE_PROPERTIES_WRITTEN, NULL_VERTEX_EDGES_IGNORED, NULL_VERTICES_IGNORED, SUCCESSFUL_TRANSACTIONS, FAILED_TRANSACTIONS } private static final String GET_OR_CREATE_VERTEX = "getOrCreateVertex(faunusVertex,graph,mapContext)"; private static final String GET_OR_CREATE_EDGE = "getOrCreateEdge(faunusEdge,blueprintsOutVertex,blueprintsInVertex,graph,mapContext)"; private static final String FAUNUS_VERTEX = "faunusVertex"; private static final String FAUNUS_EDGE = "faunusEdge"; private static final String BLUEPRINTS_OUT_VERTEX = "blueprintsOutVertex"; private static final String BLUEPRINTS_IN_VERTEX = "blueprintsInVertex"; private static final String GRAPH = "graph"; private static final String MAP_CONTEXT = "mapContext"; public static final String FAUNUS_GRAPH_OUTPUT_BLUEPRINTS_SCRIPT_FILE = "faunus.graph.output.blueprints.script-file"; public static final Logger LOGGER = Logger.getLogger(BlueprintsGraphOutputMapReduce.class); // some random properties that will 'never' be used by anyone public static final String BLUEPRINTS_ID = "_bId0192834"; public static final String ID_MAP_KEY = "_iDMaPKeY"; public static Graph generateGraph(final Configuration config) { final Class<? extends OutputFormat> format = config.getClass(FaunusGraph.FAUNUS_GRAPH_OUTPUT_FORMAT, OutputFormat.class, OutputFormat.class); if (TitanOutputFormat.class.isAssignableFrom(format)) { return GraphFactory.generateGraph(config, TitanOutputFormat.FAUNUS_GRAPH_OUTPUT_TITAN); } else { // TODO: this is where Rexster can come into play here throw new RuntimeException("The provide graph output format is not supported: " + format.getName()); } } public static Configuration createConfiguration() { final Configuration configuration = new EmptyConfiguration(); configuration.setBoolean("mapred.map.tasks.speculative.execution", false); configuration.setBoolean("mapred.reduce.tasks.speculative.execution", false); return configuration; } ////////////// MAP/REDUCE WORK FROM HERE ON OUT // WRITE ALL THE VERTICES AND THEIR PROPERTIES public static class VertexMap extends Mapper<NullWritable, FaunusVertex, LongWritable, Holder<FaunusVertex>> { static GremlinGroovyScriptEngine engine = null; static boolean firstRead = true; Graph graph; private final Holder<FaunusVertex> vertexHolder = new Holder<FaunusVertex>(); private final LongWritable longWritable = new LongWritable(); private final FaunusVertex shellVertex = new FaunusVertex(); @Override public void setup(final Mapper.Context context) throws IOException, InterruptedException { this.graph = BlueprintsGraphOutputMapReduce.generateGraph(context.getConfiguration()); final String file = context.getConfiguration().get(FAUNUS_GRAPH_OUTPUT_BLUEPRINTS_SCRIPT_FILE, null); if (null != file && firstRead) { final FileSystem fs = FileSystem.get(context.getConfiguration()); try { engine = new GremlinGroovyScriptEngine(); engine.eval(new InputStreamReader(fs.open(new Path(file)))); try { engine.eval("getOrCreateVertex(null,null,null)"); } catch (ScriptException se) { if (se.getCause().getCause() instanceof MissingMethodException) engine = null; } } catch (Exception e) { throw new IOException(e.getMessage()); } firstRead = false; } LOGGER.setLevel(Level.INFO); } @Override public void map(final NullWritable key, final FaunusVertex value, final Mapper<NullWritable, FaunusVertex, LongWritable, Holder<FaunusVertex>>.Context context) throws IOException, InterruptedException { try { // Read (and/or Write) FaunusVertex (and respective properties) to Blueprints Graph // Attempt to use the ID provided by Faunus final Vertex blueprintsVertex = this.getOrCreateVertex(value, context); // Propagate shell vertices with Blueprints ids this.shellVertex.reuse(value.getIdAsLong()); this.shellVertex.setProperty(BLUEPRINTS_ID, blueprintsVertex.getId()); // TODO: Might need to be OUT for the sake of unidirectional edges in Titan for (final Edge faunusEdge : value.getEdges(IN)) { this.longWritable.set((Long) faunusEdge.getVertex(OUT).getId()); context.write(this.longWritable, this.vertexHolder.set('s', this.shellVertex)); } this.longWritable.set(value.getIdAsLong()); value.getProperties().clear(); // no longer needed in reduce phase value.setProperty(BLUEPRINTS_ID, blueprintsVertex.getId()); // need this for id resolution in reduce phase value.removeEdges(Tokens.Action.DROP, IN); // no longer needed in second map phase context.write(this.longWritable, this.vertexHolder.set('v', value)); } catch (final Exception e) { if (this.graph instanceof TransactionalGraph) { ((TransactionalGraph) this.graph).rollback(); context.getCounter(Counters.FAILED_TRANSACTIONS).increment(1l); } throw new IOException(e.getMessage(), e); } } @Override public void cleanup(final Mapper<NullWritable, FaunusVertex, LongWritable, Holder<FaunusVertex>>.Context context) throws IOException, InterruptedException { if (this.graph instanceof TransactionalGraph) { try { ((TransactionalGraph) this.graph).commit(); context.getCounter(Counters.SUCCESSFUL_TRANSACTIONS).increment(1l); } catch (Exception e) { LOGGER.error("Could not commit transaction during VertexMap.cleanup():", e); ((TransactionalGraph) this.graph).rollback(); context.getCounter(Counters.FAILED_TRANSACTIONS).increment(1l); throw new IOException(e.getMessage(), e); } } this.graph.shutdown(); } public Vertex getOrCreateVertex(final FaunusVertex faunusVertex, final Mapper<NullWritable, FaunusVertex, LongWritable, Holder<FaunusVertex>>.Context context) throws InterruptedException { final Vertex blueprintsVertex; if (null == engine) { blueprintsVertex = this.graph.addVertex(faunusVertex.getIdAsLong()); context.getCounter(Counters.VERTICES_WRITTEN).increment(1l); for (final String property : faunusVertex.getPropertyKeys()) { blueprintsVertex.setProperty(property, faunusVertex.getProperty(property)); context.getCounter(Counters.VERTEX_PROPERTIES_WRITTEN).increment(1l); } } else { try { final Bindings bindings = engine.createBindings(); bindings.put(FAUNUS_VERTEX, faunusVertex); bindings.put(GRAPH, this.graph); bindings.put(MAP_CONTEXT, context); blueprintsVertex = (Vertex) engine.eval(GET_OR_CREATE_VERTEX, bindings); } catch (Exception e) { throw new InterruptedException(e.getMessage()); } } return blueprintsVertex; } } public static class Reduce extends Reducer<LongWritable, Holder<FaunusVertex>, NullWritable, FaunusVertex> { @Override public void reduce(final LongWritable key, final Iterable<Holder<FaunusVertex>> values, final Reducer<LongWritable, Holder<FaunusVertex>, NullWritable, FaunusVertex>.Context context) throws IOException, InterruptedException { FaunusVertex faunusVertex = null; // generate a map of the faunus id with the blueprints id for all shell vertices (vertices incoming adjacent) final java.util.Map<Long, Object> faunusBlueprintsIdMap = new HashMap<Long, Object>(); for (final Holder<FaunusVertex> holder : values) { if (holder.getTag() == 's') { faunusBlueprintsIdMap.put(holder.get().getIdAsLong(), holder.get().getProperty(BLUEPRINTS_ID)); } else { final FaunusVertex toClone = holder.get(); faunusVertex = new FaunusVertex(toClone.getIdAsLong()); faunusVertex.setProperty(BLUEPRINTS_ID, toClone.getProperty(BLUEPRINTS_ID)); faunusVertex.addEdges(OUT, toClone); } } if (null != faunusVertex) { faunusVertex.setProperty(ID_MAP_KEY, faunusBlueprintsIdMap); context.write(NullWritable.get(), faunusVertex); } else { LOGGER.warn("No source vertex: faunusVertex[" + key.get() + "]"); context.getCounter(Counters.NULL_VERTICES_IGNORED).increment(1l); } } } // WRITE ALL THE EDGES CONNECTING THE VERTICES public static class EdgeMap extends Mapper<NullWritable, FaunusVertex, NullWritable, FaunusVertex> { static GremlinGroovyScriptEngine engine = null; static boolean firstRead = true; Graph graph; private static final FaunusVertex DEAD_FAUNUS_VERTEX = new FaunusVertex(); @Override public void setup(final Mapper.Context context) throws IOException, InterruptedException { this.graph = BlueprintsGraphOutputMapReduce.generateGraph(context.getConfiguration()); final String file = context.getConfiguration().get(FAUNUS_GRAPH_OUTPUT_BLUEPRINTS_SCRIPT_FILE, null); if (null != file && firstRead) { final FileSystem fs = FileSystem.get(context.getConfiguration()); try { engine = new GremlinGroovyScriptEngine(); engine.eval(new InputStreamReader(fs.open(new Path(file)))); try { engine.eval("getOrCreateEdge(null,null,null,null,null)"); } catch (ScriptException se) { if (se.getCause().getCause() instanceof MissingMethodException) engine = null; } } catch (Exception e) { throw new IOException(e.getMessage()); } firstRead = false; } LOGGER.setLevel(Level.INFO); } @Override public void map(final NullWritable key, final FaunusVertex value, final Mapper<NullWritable, FaunusVertex, NullWritable, FaunusVertex>.Context context) throws IOException, InterruptedException { try { final java.util.Map<Long, Object> faunusBlueprintsIdMap = value.getProperty(ID_MAP_KEY); final Object blueprintsId = value.getProperty(BLUEPRINTS_ID); Vertex blueprintsVertex = null; if (null != blueprintsId) blueprintsVertex = this.graph.getVertex(blueprintsId); // this means that an adjacent vertex to this vertex wasn't created if (null != blueprintsVertex) { for (final Edge faunusEdge : value.getEdges(OUT)) { final Object otherId = faunusBlueprintsIdMap.get(faunusEdge.getVertex(IN).getId()); Vertex otherVertex = null; if (null != otherId) otherVertex = this.graph.getVertex(otherId); if (null != otherVertex) { this.getOrCreateEdge((FaunusEdge) faunusEdge, blueprintsVertex, otherVertex, context); } else { LOGGER.warn("No target vertex: faunusVertex[" + faunusEdge.getVertex(IN).getId() + "] blueprintsVertex[" + otherId + "]"); context.getCounter(Counters.NULL_VERTEX_EDGES_IGNORED).increment(1l); } } } else { LOGGER.warn("No source vertex: faunusVertex[" + NullWritable.get() + "] blueprintsVertex[" + blueprintsId + "]"); context.getCounter(Counters.NULL_VERTICES_IGNORED).increment(1l); } // the emitted vertex is not complete -- assuming this is the end of the stage and vertex is dead context.write(NullWritable.get(), DEAD_FAUNUS_VERTEX); } catch (final Exception e) { if (this.graph instanceof TransactionalGraph) { ((TransactionalGraph) this.graph).rollback(); context.getCounter(Counters.FAILED_TRANSACTIONS).increment(1l); } throw new IOException(e.getMessage(), e); } } @Override public void cleanup(final Mapper<NullWritable, FaunusVertex, NullWritable, FaunusVertex>.Context context) throws IOException, InterruptedException { if (this.graph instanceof TransactionalGraph) { try { ((TransactionalGraph) this.graph).commit(); context.getCounter(Counters.SUCCESSFUL_TRANSACTIONS).increment(1l); } catch (Exception e) { LOGGER.error("Could not commit transaction during EdgeMap.cleanup():", e); ((TransactionalGraph) this.graph).rollback(); context.getCounter(Counters.FAILED_TRANSACTIONS).increment(1l); throw new IOException(e.getMessage(), e); } } this.graph.shutdown(); } public Edge getOrCreateEdge(final FaunusEdge faunusEdge, final Vertex blueprintsOutVertex, final Vertex blueprintsInVertex, final Mapper<NullWritable, FaunusVertex, NullWritable, FaunusVertex>.Context context) throws InterruptedException { final Edge blueprintsEdge; if (null == engine) { blueprintsEdge = this.graph.addEdge(null, blueprintsOutVertex, blueprintsInVertex, faunusEdge.getLabel()); context.getCounter(Counters.EDGES_WRITTEN).increment(1l); for (final String property : faunusEdge.getPropertyKeys()) { blueprintsEdge.setProperty(property, faunusEdge.getProperty(property)); context.getCounter(Counters.EDGE_PROPERTIES_WRITTEN).increment(1l); } } else { try { final Bindings bindings = engine.createBindings(); bindings.put(FAUNUS_EDGE, faunusEdge); bindings.put(BLUEPRINTS_OUT_VERTEX, blueprintsOutVertex); bindings.put(BLUEPRINTS_IN_VERTEX, blueprintsInVertex); bindings.put(GRAPH, this.graph); bindings.put(MAP_CONTEXT, context); blueprintsEdge = (Edge) engine.eval(GET_OR_CREATE_EDGE, bindings); } catch (Exception e) { throw new InterruptedException(e.getMessage()); } } return blueprintsEdge; } } }