/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.graph.library;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.functions.FunctionAnnotation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.GraphAlgorithm;
import org.apache.flink.graph.Vertex;
import org.apache.flink.types.Either;
import org.apache.flink.types.NullValue;
import org.apache.flink.util.Collector;
/**
* The summarization algorithm computes a condensed version of the input graph<br>
* by grouping vertices and edges based on their values. By doing this, the<br>
* algorithm helps to uncover insights about patterns and distributions in the<br>
* graph.
* <p>
* In the resulting graph, each vertex represents a group of vertices that share the<br>
* same vertex value. An edge, that connects a vertex with itself, represents all edges<br>
* with the same edge value that connect vertices inside that group. An edge between<br>
* vertices in the output graph represents all edges with the same edge value between<br>
* members of those groups in the input graph.
* <p>
* Consider the following example:
* <p>
* Input graph:
* <p>
* Vertices (id, value):<br>
* (0, "A")<br>
* (1, "A")<br>
* (2, "B")<br>
* (3, "B")<br>
* <p>
* Edges (source, target, value):<br>
* (0,1, null)<br>
* (1,0, null)<br>
* (1,2, null)<br>
* (2,1, null)<br>
* (2,3, null)<br>
* (3,2, null)<br>
* <p>
* Output graph:
* <p>
* Vertices (id, (value, count)):<br>
* (0, ("A", 2)) // 0 and 1 <br>
* (2, ("B", 2)) // 2 and 3 <br>
* <p>
* Edges (source, target, (value, count)):<br>
* (0, 0, (null, 2)) // (0,1) and (1,0) <br>
* (2, 2, (null, 2)) // (2,3) and (3,2) <br>
* (0, 2, (null, 1)) // (1,2) <br>
* (2, 0, (null, 1)) // (2,1) <br>
* <p>
* Note that this implementation is non-deterministic in the way it assigns<br>
* identifiers to summarized vertices. However, it is guaranteed that the identifier<br>
* of a summarized vertex is the identifier of one of the vertices it represents.
*
* @param <K> vertex identifier type
* @param <VV> vertex value type
* @param <EV> edge value type
*/
public class Summarization<K, VV, EV>
implements GraphAlgorithm<K, VV, EV,
Graph<K, Summarization.VertexValue<VV>, Summarization.EdgeValue<EV>>> {
/**
 * Builds the summarized graph: one vertex per group of input vertices sharing a value, and one
 * edge per distinct combination of source group, target group and edge value.
 *
 * @param input graph to summarize
 * @return graph whose vertex and edge values carry the group value together with the group size
 * @throws Exception if assembling the summarized graph fails
 */
@Override
public Graph<K, VertexValue<VV>, EdgeValue<EV>> run(Graph<K, VV, EV> input) throws Exception {
    // ---------------------------------------------------------------
    // Step 1: super vertices
    // ---------------------------------------------------------------

    // Grouping on tuple position 1 (the vertex value) produces, for each group, one item per
    // member vertex plus one additional item that describes the group as a whole.
    DataSet<Vertex<K, VV>> inputVertices = input.getVertices();
    DataSet<VertexGroupItem<K, VV>> groupItems = inputVertices
        .groupBy(1)
        .reduceGroup(new VertexGroupReducer<K, VV>());

    // Keep only the items describing a whole group and turn each into a summarized vertex.
    DataSet<VertexGroupItem<K, VV>> groupSummaries = groupItems
        .filter(new VertexGroupItemToSummarizedVertexFilter<K, VV>());
    DataSet<Vertex<K, VertexValue<VV>>> superVertices = groupSummaries
        .map(new VertexGroupItemToSummarizedVertexMapper<K, VV>());

    // ---------------------------------------------------------------
    // Step 2: super edges
    // ---------------------------------------------------------------

    // The remaining items map every input vertex to the representative of its group.
    DataSet<VertexGroupItem<K, VV>> memberItems = groupItems
        .filter(new VertexGroupItemToRepresentativeFilter<K, VV>());
    DataSet<VertexWithRepresentative<K>> representativeLookup = memberItems
        .map(new VertexGroupItemToVertexWithRepresentativeMapper<K, VV>());

    // First join: edge source id (0) against vertex id (0); swaps in the source representative.
    DataSet<Edge<K, EV>> edgesWithGroupedSource = input.getEdges()
        .join(representativeLookup)
        .where(0)
        .equalTo(0)
        .with(new SourceVertexJoinFunction<K, EV>());

    // Second join: edge target id (1) against vertex id (0); swaps in the target representative.
    DataSet<Edge<K, EV>> edgesWithGroupedEndpoints = edgesWithGroupedSource
        .join(representativeLookup)
        .where(1)
        .equalTo(0)
        .with(new TargetVertexJoinFunction<K, EV>());

    // Collapse edges that agree on source id (0), target id (1) and edge value (2).
    DataSet<Edge<K, EdgeValue<EV>>> superEdges = edgesWithGroupedEndpoints
        .groupBy(0, 1, 2)
        .reduceGroup(new EdgeGroupReducer<K, EV>());

    return Graph.fromDataSet(superVertices, superEdges, input.getContext());
}
// --------------------------------------------------------------------------------------------
// Tuple Types
// --------------------------------------------------------------------------------------------
/**
 * Payload carried by a summarized vertex.
 *
 * <p>Tuple layout: {@code f0} is the vertex group value, {@code f1} is the vertex group count.
 *
 * @param <VV> vertex value type
 */
@SuppressWarnings("serial")
public static final class VertexValue<VV> extends Tuple2<VV, Long> {

    /**
     * @return the vertex group value held in {@code f0}
     */
    public VV getVertexGroupValue() {
        return this.f0;
    }

    /**
     * @param vertexGroupValue value to store in {@code f0}
     */
    public void setVertexGroupValue(VV vertexGroupValue) {
        this.f0 = vertexGroupValue;
    }

    /**
     * @return the vertex group count held in {@code f1}
     */
    public Long getVertexGroupCount() {
        return this.f1;
    }

    /**
     * @param vertexGroupCount count to store in {@code f1}
     */
    public void setVertexGroupCount(Long vertexGroupCount) {
        this.f1 = vertexGroupCount;
    }
}
/**
 * Payload carried by a summarized edge.
 *
 * <p>Tuple layout: {@code f0} is the edge group value, {@code f1} is the edge group count.
 *
 * @param <EV> edge value type
 */
@SuppressWarnings("serial")
public static final class EdgeValue<EV> extends Tuple2<EV, Long> {

    /**
     * @return the edge group value held in {@code f0}
     */
    public EV getEdgeGroupValue() {
        return this.f0;
    }

    /**
     * @param edgeGroupValue value to store in {@code f0}
     */
    public void setEdgeGroupValue(EV edgeGroupValue) {
        this.f0 = edgeGroupValue;
    }

    /**
     * @return the edge group count held in {@code f1}
     */
    public Long getEdgeGroupCount() {
        return this.f1;
    }

    /**
     * @param edgeGroupCount count to store in {@code f1}
     */
    public void setEdgeGroupCount(Long edgeGroupCount) {
        this.f1 = edgeGroupCount;
    }
}
/**
 * One record produced while grouping vertices.
 *
 * <p>Tuple layout:
 * <ul>
 *   <li>{@code f0}: vertex identifier</li>
 *   <li>{@code f1}: identifier of the vertex group representative</li>
 *   <li>{@code f2}: vertex group value, wrapped in an {@link Either} so that an absent value is
 *       stored as a {@link NullValue} on the right side instead of {@code null}</li>
 *   <li>{@code f3}: vertex group count</li>
 * </ul>
 *
 * @param <K> vertex identifier type
 * @param <VGV> vertex group value type
 */
@SuppressWarnings("serial")
public static final class VertexGroupItem<K, VGV> extends Tuple4<K, K, Either<VGV, NullValue>, Long> {

    /** Shared right-hand side instance used whenever no group value is present. */
    private final Either.Right<VGV, NullValue> nullValue = new Either.Right<>(NullValue.getInstance());

    /** Creates an item with all fields at their initial values (see {@link #reset()}). */
    public VertexGroupItem() {
        reset();
    }

    public K getVertexId() {
        return this.f0;
    }

    public void setVertexId(K vertexId) {
        this.f0 = vertexId;
    }

    public K getGroupRepresentativeId() {
        return this.f1;
    }

    public void setGroupRepresentativeId(K groupRepresentativeId) {
        this.f1 = groupRepresentativeId;
    }

    /**
     * @return the group value if one is stored on the left side of {@code f2}, otherwise {@code null}
     */
    public VGV getVertexGroupValue() {
        if (f2.isLeft()) {
            return f2.left();
        }
        return null;
    }

    /**
     * Stores the group value; {@code null} is mapped to the shared {@link NullValue} placeholder.
     *
     * @param vertexGroupValue group value, may be {@code null}
     */
    public void setVertexGroupValue(VGV vertexGroupValue) {
        if (vertexGroupValue != null) {
            f2 = new Either.Left<>(vertexGroupValue);
            return;
        }
        f2 = nullValue;
    }

    public Long getVertexGroupCount() {
        return this.f3;
    }

    public void setVertexGroupCount(Long vertexGroupCount) {
        this.f3 = vertexGroupCount;
    }

    /**
     * Puts every field back to its initial value: identifiers become {@code null}, the group value
     * becomes the {@link NullValue} placeholder and the count becomes zero. Needed when an instance
     * is reused and a later use does not overwrite all fields.
     */
    public void reset() {
        this.f3 = 0L;
        this.f2 = nullValue;
        this.f1 = null;
        this.f0 = null;
    }
}
/**
 * Pairs a vertex identifier ({@code f0}) with the identifier of the vertex that represents its
 * group ({@code f1}).
 *
 * @param <K> vertex identifier type
 */
@SuppressWarnings("serial")
public static final class VertexWithRepresentative<K> extends Tuple2<K, K> {

    /**
     * @param vertexId vertex identifier to store in {@code f0}
     */
    public void setVertexId(K vertexId) {
        this.f0 = vertexId;
    }

    /**
     * @return the group representative identifier held in {@code f1}
     */
    public K getGroupRepresentativeId() {
        return this.f1;
    }

    /**
     * @param groupRepresentativeId group representative identifier to store in {@code f1}
     */
    public void setGroupRepresentativeId(K groupRepresentativeId) {
        this.f1 = groupRepresentativeId;
    }
}
// --------------------------------------------------------------------------------------------
// Functions
// --------------------------------------------------------------------------------------------
/**
 * Turns a group of vertices into {@link VertexGroupItem}s.
 *
 * <p>For every vertex in the group, one item is emitted that holds the vertex identifier and the
 * identifier of the group representative; the representative is the first vertex delivered by the
 * reduce input. These items keep a group count of zero.
 *
 * <p>After the members, one more item is emitted for the group as a whole. It holds the
 * representative's identifier, the group value taken from the first vertex and the number of
 * vertices seen.
 *
 * @param <K> vertex identifier type
 * @param <VV> vertex value type
 */
@SuppressWarnings("serial")
private static final class VertexGroupReducer<K, VV>
    extends RichGroupReduceFunction<Vertex<K, VV>, VertexGroupItem<K, VV>> {

    /** Output object shared by all emitted items; created in {@link #open(Configuration)}. */
    private transient VertexGroupItem<K, VV> reuseVertexGroupItem;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        reuseVertexGroupItem = new VertexGroupItem<>();
    }

    @Override
    public void reduce(Iterable<Vertex<K, VV>> values, Collector<VertexGroupItem<K, VV>> out) throws Exception {
        K representativeId = null;
        VV groupValue = null;
        long memberCount = 0L;

        for (Vertex<K, VV> member : values) {
            if (memberCount == 0L) {
                // the first vertex of the group supplies the representative id and the group value
                representativeId = member.getId();
                groupValue = member.getValue();
            }

            // member items carry identifiers only; value and count stay at their reset defaults
            reuseVertexGroupItem.setVertexId(member.getId());
            reuseVertexGroupItem.setGroupRepresentativeId(representativeId);
            out.collect(reuseVertexGroupItem);

            memberCount++;
        }

        // final item for the whole group; the representative id field still holds the value
        // written during the last loop iteration
        reuseVertexGroupItem.setVertexId(representativeId);
        reuseVertexGroupItem.setVertexGroupValue(groupValue);
        reuseVertexGroupItem.setVertexGroupCount(memberCount);
        out.collect(reuseVertexGroupItem);

        // clear value and count so the next group's member items start from defaults again
        reuseVertexGroupItem.reset();
    }
}
/**
 * Collapses a group of edges into a single summarized edge.
 *
 * <p>Source id, target id and edge value are taken from the first edge of the group; the
 * summarized edge's value additionally records how many edges the group contained. Exactly one
 * edge is emitted per invocation of {@link #reduce(Iterable, Collector)}.
 *
 * @param <K> vertex identifier type
 * @param <EV> edge group value type
 */
@SuppressWarnings("serial")
private static final class EdgeGroupReducer<K, EV>
    implements GroupReduceFunction<Edge<K, EV>, Edge<K, EdgeValue<EV>>> {

    /** Output edge, reused across invocations. */
    private final Edge<K, EdgeValue<EV>> reuseEdge;

    /** Output edge value, reused across invocations and attached to {@link #reuseEdge}. */
    private final EdgeValue<EV> reuseEdgeValue;

    private EdgeGroupReducer() {
        reuseEdge = new Edge<>();
        reuseEdgeValue = new EdgeValue<>();
    }

    /**
     * Counts the edges of the group and emits one summarized edge.
     *
     * @param values edges of one group
     * @param out collector receiving the summarized edge
     * @throws Exception declared by the implemented interface method
     */
    @Override
    public void reduce(Iterable<Edge<K, EV>> values, Collector<Edge<K, EdgeValue<EV>>> out) throws Exception {
        K sourceVertexId = null;
        K targetVertexId = null;
        EV edgeGroupValue = null;
        // primitive counter: a boxed Long here would be unboxed and re-boxed on every increment
        long edgeGroupCount = 0L;

        for (Edge<K, EV> edge : values) {
            if (edgeGroupCount == 0L) {
                // all edges of a group share these three fields, so the first one is enough
                sourceVertexId = edge.getSource();
                targetVertexId = edge.getTarget();
                edgeGroupValue = edge.getValue();
            }
            edgeGroupCount++;
        }

        reuseEdgeValue.setEdgeGroupValue(edgeGroupValue);
        // boxed exactly once, when stored in the tuple field
        reuseEdgeValue.setEdgeGroupCount(edgeGroupCount);

        reuseEdge.setSource(sourceVertexId);
        reuseEdge.setTarget(targetVertexId);
        reuseEdge.setValue(reuseEdgeValue);
        out.collect(reuseEdge);
    }
}
/**
 * Keeps only the items that describe a whole vertex group, recognised by a group count different
 * from zero. These items are the ones turned into summarized vertices.
 *
 * @param <K> vertex identifier type
 * @param <VV> vertex value type
 */
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFields("*->*")
private static final class VertexGroupItemToSummarizedVertexFilter<K, VV>
    implements FilterFunction<VertexGroupItem<K, VV>> {

    /**
     * @param vertexGroupItem item to test
     * @return {@code true} if the item's group count is not zero
     */
    @Override
    public boolean filter(VertexGroupItem<K, VV> vertexGroupItem) throws Exception {
        Long groupCount = vertexGroupItem.getVertexGroupCount();
        return groupCount.longValue() != 0L;
    }
}
/**
 * Keeps only the items that describe a single vertex, recognised by a group count of zero. These
 * items are the ones used to rewrite the source and target identifiers of the edges.
 *
 * @param <K> vertex identifier type
 * @param <VV> vertex value type
 */
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFields("*->*")
private static final class VertexGroupItemToRepresentativeFilter<K, VV>
    implements FilterFunction<VertexGroupItem<K, VV>> {

    /**
     * @param vertexGroupItem item to test
     * @return {@code true} if the item's group count is zero
     */
    @Override
    public boolean filter(VertexGroupItem<K, VV> vertexGroupItem) throws Exception {
        Long groupCount = vertexGroupItem.getVertexGroupCount();
        return groupCount.longValue() == 0L;
    }
}
/**
 * Converts a {@link VertexGroupItem} into a vertex whose value holds the group value and the
 * group count of that item.
 *
 * @param <K> vertex identifier type
 * @param <VV> vertex value type
 */
@SuppressWarnings("serial")
private static final class VertexGroupItemToSummarizedVertexMapper<K, VV>
    implements MapFunction<VertexGroupItem<K, VV>, Vertex<K, VertexValue<VV>>> {

    /** Vertex value instance shared by every vertex this mapper returns. */
    private final VertexValue<VV> reuseSummarizedVertexValue = new VertexValue<>();

    /**
     * @param value group item to convert
     * @return a new vertex with the item's vertex id and the shared, freshly updated vertex value
     */
    @Override
    public Vertex<K, VertexValue<VV>> map(VertexGroupItem<K, VV> value) throws Exception {
        K vertexId = value.getVertexId();

        reuseSummarizedVertexValue.setVertexGroupValue(value.getVertexGroupValue());
        reuseSummarizedVertexValue.setVertexGroupCount(value.getVertexGroupCount());

        Vertex<K, VertexValue<VV>> summarizedVertex = new Vertex<>(vertexId, reuseSummarizedVertexValue);
        return summarizedVertex;
    }
}
/**
 * Reduces a {@link VertexGroupItem} to a {@link VertexWithRepresentative}, keeping only the vertex
 * identifier and the group representative identifier.
 *
 * @param <K> vertex identifier type
 * @param <VV> vertex value type
 */
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFields("f0;f1")
private static final class VertexGroupItemToVertexWithRepresentativeMapper<K, VV>
    implements MapFunction<VertexGroupItem<K, VV>, VertexWithRepresentative<K>> {

    /** Output object returned by every call to {@link #map(VertexGroupItem)}. */
    private final VertexWithRepresentative<K> reuseVertexWithRepresentative = new VertexWithRepresentative<>();

    /**
     * @param vertexGroupItem item to convert
     * @return the shared output object, updated with the item's two identifiers
     */
    @Override
    public VertexWithRepresentative<K> map(VertexGroupItem<K, VV> vertexGroupItem) throws Exception {
        K vertexId = vertexGroupItem.getVertexId();
        K representativeId = vertexGroupItem.getGroupRepresentativeId();

        reuseVertexWithRepresentative.setVertexId(vertexId);
        reuseVertexWithRepresentative.setGroupRepresentativeId(representativeId);
        return reuseVertexWithRepresentative;
    }
}
/**
 * Produces an edge whose source is the group representative of the joined vertex, while the
 * target id and the edge value are copied from the input edge.
 *
 * @param <K> vertex identifier type
 * @param <EV> edge value type
 */
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFieldsFirst("f1") // edge target id
@FunctionAnnotation.ForwardedFieldsSecond("f1->f0") // vertex group id -> edge source id
private static final class SourceVertexJoinFunction<K, EV>
    implements JoinFunction<Edge<K, EV>, VertexWithRepresentative<K>, Edge<K, EV>> {

    /** Output edge returned by every call to {@link #join(Edge, VertexWithRepresentative)}. */
    private final Edge<K, EV> reuseEdge = new Edge<>();

    /**
     * @param edge input edge
     * @param vertex mapping of the edge's source vertex to its group representative
     * @return the shared output edge with the source replaced by the group representative
     */
    @Override
    public Edge<K, EV> join(Edge<K, EV> edge, VertexWithRepresentative<K> vertex) throws Exception {
        K sourceRepresentativeId = vertex.getGroupRepresentativeId();

        reuseEdge.setValue(edge.getValue());
        reuseEdge.setTarget(edge.getTarget());
        reuseEdge.setSource(sourceRepresentativeId);
        return reuseEdge;
    }
}
/**
 * Produces an edge whose target is the group representative of the joined vertex, while the
 * source id and the edge value are copied from the input edge.
 *
 * <p>The result is written into a dedicated output edge and the input edge is left untouched.
 * When object reuse is enabled, Flink's DataSet API does not allow a join function to modify or
 * emit its input objects.
 *
 * @param <K> vertex identifier type
 * @param <EV> edge group value type
 */
@SuppressWarnings("serial")
@FunctionAnnotation.ForwardedFieldsFirst("f0;f2") // source vertex id, edge group value
@FunctionAnnotation.ForwardedFieldsSecond("f1") // vertex group id -> edge target id
private static final class TargetVertexJoinFunction<K, EV>
    implements JoinFunction<Edge<K, EV>, VertexWithRepresentative<K>, Edge<K, EV>> {

    /** Output edge returned by every call to {@link #join(Edge, VertexWithRepresentative)}. */
    private final Edge<K, EV> reuseEdge = new Edge<>();

    /**
     * @param edge input edge; not modified
     * @param vertexRepresentative mapping of the edge's target vertex to its group representative
     * @return the shared output edge with the target replaced by the group representative
     * @throws Exception declared by the implemented interface method
     */
    @Override
    public Edge<K, EV> join(Edge<K, EV> edge, VertexWithRepresentative<K> vertexRepresentative) throws Exception {
        // source (f0) and value (f2) are passed through unchanged, matching the forwarded-field annotations
        reuseEdge.setSource(edge.getSource());
        reuseEdge.setTarget(vertexRepresentative.getGroupRepresentativeId());
        reuseEdge.setValue(edge.getValue());
        return reuseEdge;
    }
}
}