/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.spargel.java; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.commons.lang3.Validate; import eu.stratosphere.api.common.aggregators.Aggregator; import eu.stratosphere.api.java.DataSet; import eu.stratosphere.api.java.DeltaIteration; import eu.stratosphere.api.java.functions.CoGroupFunction; import eu.stratosphere.api.java.operators.CoGroupOperator; import eu.stratosphere.api.java.operators.CustomUnaryOperation; import eu.stratosphere.api.java.tuple.Tuple2; import eu.stratosphere.api.java.tuple.Tuple3; import eu.stratosphere.api.java.typeutils.ResultTypeQueryable; import eu.stratosphere.api.java.typeutils.TupleTypeInfo; import eu.stratosphere.api.java.typeutils.TypeExtractor; import eu.stratosphere.configuration.Configuration; import eu.stratosphere.types.TypeInformation; import eu.stratosphere.util.Collector; /** * This class represents iterative graph computations, programmed in a vertex-centric perspective. * It is a special case of <i>Bulk Synchronous Parallel<i> computation. 
The paradigm has also been
 * implemented by Google's <i>Pregel</i> system and by <i>Apache Giraph</i>.
 * <p>
 * Vertex centric algorithms operate on graphs, which are defined through vertices and edges. The
 * algorithms send messages along the edges and update the state of vertices based on
 * the old state and the incoming messages. All vertices have an initial state.
 * The computation terminates once no vertex updates its state any more.
 * Additionally, a maximum number of iterations (supersteps) may be specified.
 * <p>
 * The computation is here represented by two functions:
 * <ul>
 *   <li>The {@link VertexUpdateFunction} receives incoming messages and may update the state for
 *   the vertex. If a state is updated, messages are sent from this vertex. Initially, all vertices are
 *   considered updated.</li>
 *   <li>The {@link MessagingFunction} takes the new vertex state and sends messages along the outgoing
 *   edges of the vertex. The outgoing edges may optionally have an associated value, such as a weight.</li>
 * </ul>
 * <p>
 * Vertex-centric graph iterations are instantiated by the
 * {@link #withPlainEdges(DataSet, VertexUpdateFunction, MessagingFunction, int)} method, or the
 * {@link #withValuedEdges(DataSet, VertexUpdateFunction, MessagingFunction, int)} method, depending on whether
 * the graph's edges are carrying values.
 *
 * @param <VertexKey> The type of the vertex key (the vertex identifier).
 * @param <VertexValue> The type of the vertex value (the state of the vertex).
 * @param <Message> The type of the message sent between vertices along the edges.
 * @param <EdgeValue> The type of the values that are associated with the edges.
*/ public class VertexCentricIteration<VertexKey extends Comparable<VertexKey>, VertexValue, Message, EdgeValue> implements CustomUnaryOperation<Tuple2<VertexKey, VertexValue>, Tuple2<VertexKey, VertexValue>> { private final VertexUpdateFunction<VertexKey, VertexValue, Message> updateFunction; private final MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> messagingFunction; private final DataSet<Tuple2<VertexKey, VertexKey>> edgesWithoutValue; private final DataSet<Tuple3<VertexKey, VertexKey, EdgeValue>> edgesWithValue; private final Map<String, Class<? extends Aggregator<?>>> aggregators; private final int maximumNumberOfIterations; private final List<Tuple2<String, DataSet<?>>> bcVarsUpdate = new ArrayList<Tuple2<String,DataSet<?>>>(4); private final List<Tuple2<String, DataSet<?>>> bcVarsMessaging = new ArrayList<Tuple2<String,DataSet<?>>>(4); private final TypeInformation<Message> messageType; private DataSet<Tuple2<VertexKey, VertexValue>> initialVertices; private String name; private int parallelism = -1; // ---------------------------------------------------------------------------------- private VertexCentricIteration(VertexUpdateFunction<VertexKey, VertexValue, Message> uf, MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> mf, DataSet<Tuple2<VertexKey, VertexKey>> edgesWithoutValue, int maximumNumberOfIterations) { Validate.notNull(uf); Validate.notNull(mf); Validate.notNull(edgesWithoutValue); Validate.isTrue(maximumNumberOfIterations > 0, "The maximum number of iterations must be at least one."); // check that the edges are actually a valid tuple set of vertex key types TypeInformation<Tuple2<VertexKey, VertexKey>> edgesType = edgesWithoutValue.getType(); Validate.isTrue(edgesType.isTupleType() && edgesType.getArity() == 2, "The edges data set (for edges without edge values) must consist of 2-tuples."); TupleTypeInfo<?> tupleInfo = (TupleTypeInfo<?>) edgesType; 
Validate.isTrue(tupleInfo.getTypeAt(0).equals(tupleInfo.getTypeAt(1)) && Comparable.class.isAssignableFrom(tupleInfo.getTypeAt(0).getTypeClass()), "Both tuple fields (source and target vertex id) must be of the data type that represents the vertex key and implement the java.lang.Comparable interface."); this.updateFunction = uf; this.messagingFunction = mf; this.edgesWithoutValue = edgesWithoutValue; this.edgesWithValue = null; this.maximumNumberOfIterations = maximumNumberOfIterations; this.aggregators = new HashMap<String, Class<? extends Aggregator<?>>>(); this.messageType = getMessageType(mf); } private VertexCentricIteration(VertexUpdateFunction<VertexKey, VertexValue, Message> uf, MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> mf, DataSet<Tuple3<VertexKey, VertexKey, EdgeValue>> edgesWithValue, int maximumNumberOfIterations, boolean edgeHasValueMarker) { Validate.notNull(uf); Validate.notNull(mf); Validate.notNull(edgesWithValue); Validate.isTrue(maximumNumberOfIterations > 0, "The maximum number of iterations must be at least one."); // check that the edges are actually a valid tuple set of vertex key types TypeInformation<Tuple3<VertexKey, VertexKey, EdgeValue>> edgesType = edgesWithValue.getType(); Validate.isTrue(edgesType.isTupleType() && edgesType.getArity() == 3, "The edges data set (for edges with edge values) must consist of 3-tuples."); TupleTypeInfo<?> tupleInfo = (TupleTypeInfo<?>) edgesType; Validate.isTrue(tupleInfo.getTypeAt(0).equals(tupleInfo.getTypeAt(1)) && Comparable.class.isAssignableFrom(tupleInfo.getTypeAt(0).getTypeClass()), "The first two tuple fields (source and target vertex id) must be of the data type that represents the vertex key and implement the java.lang.Comparable interface."); Validate.isTrue(maximumNumberOfIterations > 0, "The maximum number of iterations must be at least one."); this.updateFunction = uf; this.messagingFunction = mf; this.edgesWithoutValue = null; this.edgesWithValue = edgesWithValue; 
this.maximumNumberOfIterations = maximumNumberOfIterations; this.aggregators = new HashMap<String, Class<? extends Aggregator<?>>>(); this.messageType = getMessageType(mf); } private TypeInformation<Message> getMessageType(MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> mf) { return TypeExtractor.createTypeInfo(MessagingFunction.class, mf.getClass(), 2, null, null); } /** * Registers a new aggregator. Aggregators registered here are available during the execution of the vertex updates * via {@link VertexUpdateFunction#getIterationAggregator(String)} and * {@link VertexUpdateFunction#getPreviousIterationAggregate(String)}. * * @param name The name of the aggregator, used to retrieve it and its aggregates during execution. * @param aggregator The aggregator. */ public void registerAggregator(String name, Class<? extends Aggregator<?>> aggregator) { this.aggregators.put(name, aggregator); } /** * Adds a data set as a broadcast set to the messaging function. * * @param name The name under which the broadcast data is available in the messaging function. * @param data The data set to be broadcasted. */ public void addBroadcastSetForMessagingFunction(String name, DataSet<?> data) { this.bcVarsMessaging.add(new Tuple2<String, DataSet<?>>(name, data)); } /** * Adds a data set as a broadcast set to the vertex update function. * * @param name The name under which the broadcast data is available in the vertex update function. * @param data The data set to be broadcasted. */ public void addBroadcastSetForUpdateFunction(String name, DataSet<?> data) { this.bcVarsUpdate.add(new Tuple2<String, DataSet<?>>(name, data)); } /** * Sets the name for the vertex-centric iteration. The name is displayed in logs and messages. * * @param name The name for the iteration. */ public void setName(String name) { this.name = name; } /** * Gets the name from this vertex-centric iteration. * * @return The name of the iteration. 
*/ public String getName() { return name; } /** * Sets the degree of parallelism for the iteration. * * @param parallelism The degree of parallelism. */ public void setParallelism(int parallelism) { Validate.isTrue(parallelism > 0 || parallelism == -1, "The degree of parallelism must be positive, or -1 (use default)."); this.parallelism = parallelism; } /** * Gets the iteration's degree of parallelism. * * @return The iterations parallelism, or -1, if not set. */ public int getParallelism() { return parallelism; } // -------------------------------------------------------------------------------------------- // Custom Operator behavior // -------------------------------------------------------------------------------------------- /** * Sets the input data set for this operator. In the case of this operator this input data set represents * the set of vertices with their initial state. * * @param inputData The input data set, which in the case of this operator represents the set of * vertices with their initial state. * * @see eu.stratosphere.api.java.operators.CustomUnaryOperation#setInput(eu.stratosphere.api.java.DataSet) */ @Override public void setInput(DataSet<Tuple2<VertexKey, VertexValue>> inputData) { // sanity check that we really have two tuples TypeInformation<Tuple2<VertexKey, VertexValue>> inputType = inputData.getType(); Validate.isTrue(inputType.isTupleType() && inputType.getArity() == 2, "The input data set (the initial vertices) must consist of 2-tuples."); // check that the key type here is the same as for the edges TypeInformation<VertexKey> keyType = ((TupleTypeInfo<?>) inputType).getTypeAt(0); TypeInformation<?> edgeType = edgesWithoutValue != null ? 
edgesWithoutValue.getType() : edgesWithValue.getType(); TypeInformation<VertexKey> edgeKeyType = ((TupleTypeInfo<?>) edgeType).getTypeAt(0); Validate.isTrue(keyType.equals(edgeKeyType), "The first tuple field (the vertex id) of the input data set (the initial vertices) " + "must be the same data type as the first fields of the edge data set (the source vertex id). " + "Here, the key type for the vertex ids is '%s' and the key type for the edges is '%s'.", keyType, edgeKeyType); this.initialVertices = inputData; } /** * Creates the operator that represents this vertex-centric graph computation. * * @return The operator that represents this vertex-centric graph computation. */ @Override public DataSet<Tuple2<VertexKey, VertexValue>> createResult() { if (this.initialVertices == null) { throw new IllegalStateException("The input data set has not been set."); } // prepare some type information TypeInformation<Tuple2<VertexKey, VertexValue>> vertexTypes = initialVertices.getType(); TypeInformation<VertexKey> keyType = ((TupleTypeInfo<?>) initialVertices.getType()).getTypeAt(0); TypeInformation<Tuple2<VertexKey, Message>> messageTypeInfo = new TupleTypeInfo<Tuple2<VertexKey,Message>>(keyType, messageType); // set up the iteration operator final String name = (this.name != null) ? this.name : "Vertex-centric iteration (" + updateFunction + " | " + messagingFunction + ")"; final int[] zeroKeyPos = new int[] {0}; final DeltaIteration<Tuple2<VertexKey, VertexValue>, Tuple2<VertexKey, VertexValue>> iteration = this.initialVertices.iterateDelta(this.initialVertices, this.maximumNumberOfIterations, zeroKeyPos); iteration.name(name); iteration.parallelism(parallelism); // register all aggregators for (Map.Entry<String, Class<? 
extends Aggregator<?>>> entry : this.aggregators.entrySet()) { iteration.registerAggregator(entry.getKey(), entry.getValue()); } // build the messaging function (co group) CoGroupOperator<?, ?, Tuple2<VertexKey, Message>> messages; if (edgesWithoutValue != null) { MessagingUdfNoEdgeValues<VertexKey, VertexValue, Message> messenger = new MessagingUdfNoEdgeValues<VertexKey, VertexValue, Message>(messagingFunction, messageTypeInfo); messages = this.edgesWithoutValue.coGroup(iteration.getWorkset()).where(0).equalTo(0).with(messenger); } else { MessagingUdfWithEdgeValues<VertexKey, VertexValue, Message, EdgeValue> messenger = new MessagingUdfWithEdgeValues<VertexKey, VertexValue, Message, EdgeValue>(messagingFunction, messageTypeInfo); messages = this.edgesWithValue.coGroup(iteration.getWorkset()).where(0).equalTo(0).with(messenger); } // configure coGroup message function with name and broadcast variables messages = messages.name("Messaging"); for (Tuple2<String, DataSet<?>> e : this.bcVarsMessaging) { messages = messages.withBroadcastSet(e.f1, e.f0); } VertexUpdateUdf<VertexKey, VertexValue, Message> updateUdf = new VertexUpdateUdf<VertexKey, VertexValue, Message>(updateFunction, vertexTypes); // build the update function (co group) CoGroupOperator<?, ?, Tuple2<VertexKey, VertexValue>> updates = messages.coGroup(iteration.getSolutionSet()).where(0).equalTo(0).with(updateUdf); // configure coGroup update function with name and broadcast variables updates = updates.name("Vertex State Updates"); for (Tuple2<String, DataSet<?>> e : this.bcVarsUpdate) { updates = updates.withBroadcastSet(e.f1, e.f0); } // let the operator know that we preserve the key field updates.withConstantSetFirst("0").withConstantSetSecond("0"); return iteration.closeWith(updates, updates); } // -------------------------------------------------------------------------------------------- // Constructor builders to avoid signature conflicts with generic type erasure // 
-------------------------------------------------------------------------------------------- /** * Creates a new vertex-centric iteration operator for graphs where the edges are not associated with a value. * * @param edgesWithoutValue The data set containing edges. Edges are represented as 2-tuples: (source-id, target-id) * @param vertexUpdateFunction The function that updates the state of the vertices from the incoming messages. * @param messagingFunction The function that turns changed vertex states into messages along the edges. * * @param <VertexKey> The type of the vertex key (the vertex identifier). * @param <VertexValue> The type of the vertex value (the state of the vertex). * @param <Message> The type of the message sent between vertices along the edges. * * @return An in stance of the vertex-centric graph computation operator. */ public static final <VertexKey extends Comparable<VertexKey>, VertexValue, Message> VertexCentricIteration<VertexKey, VertexValue, Message, ?> withPlainEdges( DataSet<Tuple2<VertexKey, VertexKey>> edgesWithoutValue, VertexUpdateFunction<VertexKey, VertexValue, Message> vertexUpdateFunction, MessagingFunction<VertexKey, VertexValue, Message, ?> messagingFunction, int maximumNumberOfIterations) { @SuppressWarnings("unchecked") MessagingFunction<VertexKey, VertexValue, Message, Object> tmf = (MessagingFunction<VertexKey, VertexValue, Message, Object>) messagingFunction; return new VertexCentricIteration<VertexKey, VertexValue, Message, Object>(vertexUpdateFunction, tmf, edgesWithoutValue, maximumNumberOfIterations); } /** * Creates a new vertex-centric iteration operator for graphs where the edges are associated with a value (such as * a weight or distance). * * @param edgesWithValue The data set containing edges. Edges are represented as 2-tuples: (source-id, target-id) * @param uf The function that updates the state of the vertices from the incoming messages. 
* @param mf The function that turns changed vertex states into messages along the edges. * * @param <VertexKey> The type of the vertex key (the vertex identifier). * @param <VertexValue> The type of the vertex value (the state of the vertex). * @param <Message> The type of the message sent between vertices along the edges. * @param <EdgeValue> The type of the values that are associated with the edges. * * @return An in stance of the vertex-centric graph computation operator. */ public static final <VertexKey extends Comparable<VertexKey>, VertexValue, Message, EdgeValue> VertexCentricIteration<VertexKey, VertexValue, Message, EdgeValue> withValuedEdges( DataSet<Tuple3<VertexKey, VertexKey, EdgeValue>> edgesWithValue, VertexUpdateFunction<VertexKey, VertexValue, Message> uf, MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> mf, int maximumNumberOfIterations) { return new VertexCentricIteration<VertexKey, VertexValue, Message, EdgeValue>(uf, mf, edgesWithValue, maximumNumberOfIterations, true); } // -------------------------------------------------------------------------------------------- // Wrapping UDFs // -------------------------------------------------------------------------------------------- private static final class VertexUpdateUdf<VertexKey extends Comparable<VertexKey>, VertexValue, Message> extends CoGroupFunction<Tuple2<VertexKey, Message>, Tuple2<VertexKey, VertexValue>, Tuple2<VertexKey, VertexValue>> implements ResultTypeQueryable<Tuple2<VertexKey, VertexValue>> { private static final long serialVersionUID = 1L; private final VertexUpdateFunction<VertexKey, VertexValue, Message> vertexUpdateFunction; private final MessageIterator<Message> messageIter = new MessageIterator<Message>(); private transient TypeInformation<Tuple2<VertexKey, VertexValue>> resultType; private VertexUpdateUdf(VertexUpdateFunction<VertexKey, VertexValue, Message> vertexUpdateFunction, TypeInformation<Tuple2<VertexKey, VertexValue>> resultType) { 
this.vertexUpdateFunction = vertexUpdateFunction; this.resultType = resultType; } @Override public void coGroup(Iterator<Tuple2<VertexKey, Message>> messages, Iterator<Tuple2<VertexKey, VertexValue>> vertex, Collector<Tuple2<VertexKey, VertexValue>> out) throws Exception { if (vertex.hasNext()) { Tuple2<VertexKey, VertexValue> vertexState = vertex.next(); @SuppressWarnings("unchecked") Iterator<Tuple2<?, Message>> downcastIter = (Iterator<Tuple2<?, Message>>) (Iterator<?>) messages; messageIter.setSource(downcastIter); vertexUpdateFunction.setOutput(vertexState, out); vertexUpdateFunction.updateVertex(vertexState.f0, vertexState.f1, messageIter); } else { if (messages.hasNext()) { String message = "Target vertex does not exist!."; try { Tuple2<VertexKey, Message> next = messages.next(); message = "Target vertex '" + next.f0 + "' does not exist!."; } catch (Throwable t) {} throw new Exception(message); } else { throw new Exception(); } } } @Override public void open(Configuration parameters) throws Exception { if (getIterationRuntimeContext().getSuperstepNumber() == 1) { this.vertexUpdateFunction.init(getIterationRuntimeContext()); } this.vertexUpdateFunction.preSuperstep(); } @Override public void close() throws Exception { this.vertexUpdateFunction.postSuperstep(); } @Override public TypeInformation<Tuple2<VertexKey, VertexValue>> getProducedType() { return this.resultType; } } /* * UDF that encapsulates the message sending function for graphs where the edges have no associated values. 
*/ private static final class MessagingUdfNoEdgeValues<VertexKey extends Comparable<VertexKey>, VertexValue, Message> extends CoGroupFunction<Tuple2<VertexKey, VertexKey>, Tuple2<VertexKey, VertexValue>, Tuple2<VertexKey, Message>> implements ResultTypeQueryable<Tuple2<VertexKey, Message>> { private static final long serialVersionUID = 1L; private final MessagingFunction<VertexKey, VertexValue, Message, ?> messagingFunction; private transient TypeInformation<Tuple2<VertexKey, Message>> resultType; private MessagingUdfNoEdgeValues(MessagingFunction<VertexKey, VertexValue, Message, ?> messagingFunction, TypeInformation<Tuple2<VertexKey, Message>> resultType) { this.messagingFunction = messagingFunction; this.resultType = resultType; } @Override public void coGroup(Iterator<Tuple2<VertexKey, VertexKey>> edges, Iterator<Tuple2<VertexKey, VertexValue>> state, Collector<Tuple2<VertexKey, Message>> out) throws Exception { if (state.hasNext()) { Tuple2<VertexKey, VertexValue> newVertexState = state.next(); messagingFunction.set((Iterator<?>) edges, out); messagingFunction.sendMessages(newVertexState.f0, newVertexState.f1); } } @Override public void open(Configuration parameters) throws Exception { if (getIterationRuntimeContext().getSuperstepNumber() == 1) { this.messagingFunction.init(getIterationRuntimeContext(), false); } this.messagingFunction.preSuperstep(); } @Override public void close() throws Exception { this.messagingFunction.postSuperstep(); } @Override public TypeInformation<Tuple2<VertexKey, Message>> getProducedType() { return this.resultType; } } /* * UDF that encapsulates the message sending function for graphs where the edges have an associated value. 
*/ private static final class MessagingUdfWithEdgeValues<VertexKey extends Comparable<VertexKey>, VertexValue, Message, EdgeValue> extends CoGroupFunction<Tuple3<VertexKey, VertexKey, EdgeValue>, Tuple2<VertexKey, VertexValue>, Tuple2<VertexKey, Message>> implements ResultTypeQueryable<Tuple2<VertexKey, Message>> { private static final long serialVersionUID = 1L; private final MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> messagingFunction; private transient TypeInformation<Tuple2<VertexKey, Message>> resultType; private MessagingUdfWithEdgeValues(MessagingFunction<VertexKey, VertexValue, Message, EdgeValue> messagingFunction, TypeInformation<Tuple2<VertexKey, Message>> resultType) { this.messagingFunction = messagingFunction; this.resultType = resultType; } @Override public void coGroup(Iterator<Tuple3<VertexKey, VertexKey, EdgeValue>> edges, Iterator<Tuple2<VertexKey, VertexValue>> state, Collector<Tuple2<VertexKey, Message>> out) throws Exception { if (state.hasNext()) { Tuple2<VertexKey, VertexValue> newVertexState = state.next(); messagingFunction.set((Iterator<?>) edges, out); messagingFunction.sendMessages(newVertexState.f0, newVertexState.f1); } } @Override public void open(Configuration parameters) throws Exception { if (getIterationRuntimeContext().getSuperstepNumber() == 1) { this.messagingFunction.init(getIterationRuntimeContext(), true); } this.messagingFunction.preSuperstep(); } @Override public void close() throws Exception { this.messagingFunction.postSuperstep(); } @Override public TypeInformation<Tuple2<VertexKey, Message>> getProducedType() { return this.resultType; } } }