/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.graph.library.similarity;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFields;
import org.apache.flink.api.java.operators.GroupReduceOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.Vertex;
import org.apache.flink.graph.asm.degree.annotate.undirected.VertexDegree;
import org.apache.flink.graph.asm.result.BinaryResult;
import org.apache.flink.graph.asm.result.PrintableResult;
import org.apache.flink.graph.library.similarity.AdamicAdar.Result;
import org.apache.flink.graph.utils.Murmur3_32;
import org.apache.flink.graph.utils.proxy.GraphAlgorithmWrappingDataSet;
import org.apache.flink.types.CopyableValue;
import org.apache.flink.types.FloatValue;
import org.apache.flink.types.IntValue;
import org.apache.flink.types.LongValue;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import static org.apache.flink.api.common.ExecutionConfig.PARALLELISM_DEFAULT;

/**
 * Computes the Adamic-Adar similarity score for pairs of vertices.
 * <p>
 * Adamic-Adar measures the similarity between pairs of vertices as the sum of
 * the inverse logarithm of degree over shared neighbors. Scores are non-negative
 * and unbounded. A vertex with higher degree has greater overall influence but
 * is less influential to each pair of neighbors.
 * <p>
 * This implementation produces similarity scores for each pair of vertices
 * in the graph with at least one shared neighbor; equivalently, this is the
 * set of all non-zero Adamic-Adar coefficients.
 * <p>
 * The input graph must be a simple, undirected graph containing no duplicate
 * edges or self-loops.
 * <p>
 * Results can optionally be filtered by an absolute minimum score
 * ({@link #setMinimumScore(float)}) and/or by a minimum multiple of the
 * average score ({@link #setMinimumRatio(float)}).
 *
 * @see <a href="http://social.cs.uiuc.edu/class/cs591kgk/friendsadamic.pdf">
 *      http://social.cs.uiuc.edu/class/cs591kgk/friendsadamic.pdf</a>
 *
 * @param <K> graph ID type
 * @param <VV> vertex value type
 * @param <EV> edge value type
 */
public class AdamicAdar<K extends CopyableValue<K>, VV, EV>
        extends GraphAlgorithmWrappingDataSet<K, VV, EV, Result<K>> {

    /** Number of edges per group when splitting the neighbor list of a vertex. */
    private static final int GROUP_SIZE = 64;

    // Name of the broadcast set carrying (sum of scores, number of neighbor pairs).
    // The text is only an identifier shared by runInternal() and
    // ComputeScores.open(); note that it says "vertices" although the second
    // field counts neighbor pairs. It is deliberately left unchanged.
    private static final String SUM_OF_SCORES_AND_NUMBER_OF_NEIGHBOR_PAIRS =
        "sum of scores and number of vertices";

    // Optional configuration
    private float minimumScore = 0.0f;

    private float minimumRatio = 0.0f;

    private int littleParallelism = PARALLELISM_DEFAULT;

    /**
     * Filter out Adamic-Adar scores less than the given minimum.
     *
     * @param score minimum score
     * @return this
     * @throws IllegalArgumentException if {@code score} is negative
     */
    public AdamicAdar<K, VV, EV> setMinimumScore(float score) {
        Preconditions.checkArgument(score >= 0, "Minimum score must be non-negative");

        this.minimumScore = score;

        return this;
    }

    /**
     * Filter out Adamic-Adar scores less than the given ratio times the average score.
     *
     * @param ratio minimum ratio
     * @return this
     * @throws IllegalArgumentException if {@code ratio} is negative
     */
    public AdamicAdar<K, VV, EV> setMinimumRatio(float ratio) {
        Preconditions.checkArgument(ratio >= 0, "Minimum ratio must be non-negative");

        this.minimumRatio = ratio;

        return this;
    }

    /**
     * Override the parallelism of operators processing small amounts of data.
     *
     * @param littleParallelism operator parallelism; must be positive or
     *                          {@code PARALLELISM_DEFAULT}
     * @return this
     * @throws IllegalArgumentException if the parallelism is neither positive
     *                                  nor {@code PARALLELISM_DEFAULT}
     */
    public AdamicAdar<K, VV, EV> setLittleParallelism(int littleParallelism) {
        Preconditions.checkArgument(littleParallelism > 0 || littleParallelism == PARALLELISM_DEFAULT,
            "The parallelism must be greater than zero.");

        this.littleParallelism = littleParallelism;

        return this;
    }

    @Override
    protected String getAlgorithmName() {
        return AdamicAdar.class.getName();
    }

    /**
     * Attempts to merge the configuration of another algorithm instance into
     * this one.
     * <p>
     * Merging succeeds only when {@code other} is an {@code AdamicAdar} with the
     * same minimum ratio and minimum score. On success the "little parallelism"
     * becomes whichever value is explicitly set, or the smaller of the two when
     * both are set.
     *
     * @param other the algorithm whose configuration should be merged
     * @return {@code true} if the configurations were merged, {@code false} if
     *         they are incompatible
     * @throws NullPointerException if {@code other} is {@code null}
     */
    @Override
    protected boolean mergeConfiguration(GraphAlgorithmWrappingDataSet other) {
        Preconditions.checkNotNull(other);

        if (!(other instanceof AdamicAdar)) {
            return false;
        }

        AdamicAdar<?, ?, ?> rhs = (AdamicAdar<?, ?, ?>) other;

        // verify that configurations can be merged
        if (minimumRatio != rhs.minimumRatio || minimumScore != rhs.minimumScore) {
            return false;
        }

        // merge configurations: an explicit parallelism wins over the default,
        // and the smaller of two explicit values wins otherwise
        if (littleParallelism == PARALLELISM_DEFAULT) {
            littleParallelism = rhs.littleParallelism;
        } else if (rhs.littleParallelism != PARALLELISM_DEFAULT) {
            littleParallelism = Math.min(littleParallelism, rhs.littleParallelism);
        }

        return true;
    }

    /*
     * Implementation notes:
     *
     * The requirement that "K extends CopyableValue<K>" can be removed when
     * Flink has a self-join which performs the skew distribution handled by
     * GenerateGroupSpans / GenerateGroups / GenerateGroupPairs.
     */

    /**
     * Builds the dataflow computing the Adamic-Adar score for every pair of
     * vertices sharing at least one neighbor.
     * <p>
     * Only the "Generate group pairs" and "Compute scores" operators are left
     * without an explicit parallelism; every other operator is configured with
     * the "little parallelism".
     *
     * @param input the input graph
     * @return one {@link Result} per scored pair of vertices
     * @throws Exception declared by the overridden method
     */
    @Override
    public DataSet<Result<K>> runInternal(Graph<K, VV, EV> input)
            throws Exception {
        // s, d(s), 1/log(d(s))
        DataSet<Tuple3<K, LongValue, FloatValue>> inverseLogDegree = input
            .run(new VertexDegree<K, VV, EV>()
                .setParallelism(littleParallelism))
            .map(new VertexInverseLogDegree<K>())
                .setParallelism(littleParallelism)
                .name("Vertex score");

        // s, t, 1/log(d(s))
        DataSet<Tuple3<K, K, FloatValue>> sourceInverseLogDegree = input
            .getEdges()
            .join(inverseLogDegree, JoinHint.REPARTITION_HASH_SECOND)
            .where(0)
            .equalTo(0)
            .projectFirst(0, 1)
            .<Tuple3<K, K, FloatValue>>projectSecond(2)
                .setParallelism(littleParallelism)
                .name("Edge score");

        // group span, s, t, 1/log(d(s))
        DataSet<Tuple4<IntValue, K, K, FloatValue>> groupSpans = sourceInverseLogDegree
            .groupBy(0)
            .sortGroup(1, Order.ASCENDING)
            .reduceGroup(new GenerateGroupSpans<K>())
                .setParallelism(littleParallelism)
                .name("Generate group spans");

        // group, s, t, 1/log(d(s))
        DataSet<Tuple4<IntValue, K, K, FloatValue>> groups = groupSpans
            .rebalance()
                .setParallelism(littleParallelism)
                .name("Rebalance")
            .flatMap(new GenerateGroups<K>())
                .setParallelism(littleParallelism)
                .name("Generate groups");

        // t, u, 1/log(d(s)) where (s, t) and (s, u) are edges in graph
        DataSet<Tuple3<K, K, FloatValue>> twoPaths = groups
            .groupBy(0, 1)
            .sortGroup(2, Order.ASCENDING)
            .reduceGroup(new GenerateGroupPairs<K>())
                .name("Generate group pairs");

        // t, u, adamic-adar score
        GroupReduceOperator<Tuple3<K, K, FloatValue>, Result<K>> scores = twoPaths
            .groupBy(0, 1)
            .reduceGroup(new ComputeScores<K>(minimumScore, minimumRatio))
                .name("Compute scores");

        // the average score is only needed when filtering by ratio
        if (minimumRatio > 0.0f) {
            // total score, number of pairs of neighbors
            DataSet<Tuple2<FloatValue, LongValue>> sumOfScoresAndNumberOfNeighborPairs = inverseLogDegree
                .map(new ComputeScoreFromVertex<K>())
                    .setParallelism(littleParallelism)
                    .name("Average score")
                .sum(0)
                .andSum(1);

            scores
                .withBroadcastSet(sumOfScoresAndNumberOfNeighborPairs, SUM_OF_SCORES_AND_NUMBER_OF_NEIGHBOR_PAIRS);
        }

        return scores;
    }

    /**
     * Compute the inverse logarithm of the vertex degree. This is computed
     * before enumerating neighbor pairs since logarithm and division are quite
     * computationally intensive.
     * <p>
     * The returned tuple is a single instance reused across calls.
     *
     * @param <T> ID type
     */
    @ForwardedFields("0; 1")
    private static class VertexInverseLogDegree<T>
            implements MapFunction<Vertex<T, LongValue>, Tuple3<T, LongValue, FloatValue>> {
        private final Tuple3<T, LongValue, FloatValue> output = new Tuple3<>(null, null, new FloatValue());

        @Override
        public Tuple3<T, LongValue, FloatValue> map(Vertex<T, LongValue> value)
                throws Exception {
            output.f0 = value.f0;
            output.f1 = value.f1;

            long degree = value.f1.getValue();
            // when the degree is one the logarithm is zero so avoid dividing by this value
            float inverseLogDegree = (degree == 1) ? 0.0f : 1.0f / (float) Math.log(degree);
            output.f2.setValue(inverseLogDegree);

            return output;
        }
    }

    /**
     * Prefixes each edge of a group with a span count. The count starts at one
     * and is incremented after every {@code GROUP_SIZE} edges, so later edges
     * of a long group carry a larger count.
     *
     * @see JaccardIndex.GenerateGroupSpans
     *
     * @param <T> ID type
     */
    @ForwardedFields("0->1; 1->2 ; 2->3")
    private static class GenerateGroupSpans<T>
            implements GroupReduceFunction<Tuple3<T, T, FloatValue>, Tuple4<IntValue, T, T, FloatValue>> {
        private final IntValue groupSpansValue = new IntValue();

        // f0 is bound to groupSpansValue so updating the value updates the output tuple
        private final Tuple4<IntValue, T, T, FloatValue> output = new Tuple4<>(groupSpansValue, null, null, null);

        @Override
        public void reduce(Iterable<Tuple3<T, T, FloatValue>> values, Collector<Tuple4<IntValue, T, T, FloatValue>> out)
                throws Exception {
            int groupCount = 0;
            int groupSpans = 1;

            groupSpansValue.setValue(groupSpans);

            for (Tuple3<T, T, FloatValue> edge : values) {
                output.f1 = edge.f0;
                output.f2 = edge.f1;
                output.f3 = edge.f2;

                out.collect(output);

                // start the next span once the current one holds GROUP_SIZE edges
                if (++groupCount == GROUP_SIZE) {
                    groupCount = 0;
                    groupSpansValue.setValue(++groupSpans);
                }
            }
        }
    }

    /**
     * Emits each input record once for every group index in
     * {@code [0, span count)}, replacing the span count in field 0 with the
     * group index.
     *
     * @see JaccardIndex.GenerateGroups
     *
     * @param <T> ID type
     */
    @ForwardedFields("1; 2; 3")
    private static class GenerateGroups<T>
            implements FlatMapFunction<Tuple4<IntValue, T, T, FloatValue>, Tuple4<IntValue, T, T, FloatValue>> {
        @Override
        public void flatMap(Tuple4<IntValue, T, T, FloatValue> value, Collector<Tuple4<IntValue, T, T, FloatValue>> out)
                throws Exception {
            // read the span count before the field is overwritten below
            int spans = value.f0.getValue();

            for (int idx = 0; idx < spans; idx++) {
                value.f0.setValue(idx);
                out.collect(value);
            }
        }
    }

    /**
     * Emits pairs of neighbors for a group. Up to {@code GROUP_SIZE} target IDs
     * from the start of the group are remembered; every edge is paired with all
     * IDs remembered before it, carrying the edge's score in field 2.
     *
     * @see JaccardIndex.GenerateGroupPairs
     *
     * @param <T> ID type
     */
    @ForwardedFields("3->2")
    private static class GenerateGroupPairs<T extends CopyableValue<T>>
            implements GroupReduceFunction<Tuple4<IntValue, T, T, FloatValue>, Tuple3<T, T, FloatValue>> {
        private final Tuple3<T, T, FloatValue> output = new Tuple3<>();

        private boolean initialized = false;

        private final List<T> visited = new ArrayList<>(GROUP_SIZE);

        @Override
        public void reduce(Iterable<Tuple4<IntValue, T, T, FloatValue>> values, Collector<Tuple3<T, T, FloatValue>> out)
                throws Exception {
            int visitedCount = 0;

            for (Tuple4<IntValue, T, T, FloatValue> edge : values) {
                output.f1 = edge.f2;
                output.f2 = edge.f3;

                for (int i = 0; i < visitedCount; i++) {
                    output.f0 = visited.get(i);
                    out.collect(output);
                }

                if (visitedCount < GROUP_SIZE) {
                    if (!initialized) {
                        // first record ever seen: allocate all slots once by copying
                        // this ID; slot 0 thereby already holds the correct value
                        initialized = true;

                        for (int i = 0; i < GROUP_SIZE; i++) {
                            visited.add(edge.f2.copy());
                        }
                    } else {
                        // reuse the pre-allocated slot rather than allocating a new object
                        edge.f2.copyTo(visited.get(visitedCount));
                    }

                    visitedCount += 1;
                }
            }
        }
    }

    /**
     * Compute the sum of scores emitted by the vertex over all pairs of neighbors.
     * <p>
     * For a vertex of degree {@code d} the number of neighbor pairs is
     * {@code d * (d - 1) / 2} and the emitted sum is the vertex score multiplied
     * by that count. The returned tuple is a single instance reused across calls.
     *
     * @param <T> ID type
     */
    private static class ComputeScoreFromVertex<T>
            implements MapFunction<Tuple3<T, LongValue, FloatValue>, Tuple2<FloatValue, LongValue>> {
        private final FloatValue sumOfScores = new FloatValue();

        private final LongValue numberOfNeighborPairs = new LongValue();

        private final Tuple2<FloatValue, LongValue> output = new Tuple2<>(sumOfScores, numberOfNeighborPairs);

        @Override
        public Tuple2<FloatValue, LongValue> map(Tuple3<T, LongValue, FloatValue> value)
                throws Exception {
            long degree = value.f1.getValue();
            long neighborPairs = degree * (degree - 1) / 2;

            sumOfScores.setValue(value.f2.getValue() * neighborPairs);
            numberOfNeighborPairs.setValue(neighborPairs);

            return output;
        }
    }

    /**
     * Compute the Adamic-Adar similarity as the sum over common neighbors of
     * the inverse logarithm of degree.
     * <p>
     * Pairs whose score is below the effective minimum score are dropped. When
     * a minimum ratio is configured the effective minimum is the larger of the
     * configured minimum score and the ratio times the average score, where the
     * average is read from a broadcast set.
     *
     * @param <T> ID type
     */
    @ForwardedFields("0; 1")
    private static class ComputeScores<T>
            extends RichGroupReduceFunction<Tuple3<T, T, FloatValue>, Result<T>> {
        // not final: raised in open() when a minimum ratio is configured
        private float minimumScore;

        private final float minimumRatio;

        private final Result<T> output = new Result<>();

        /**
         * @param minimumScore scores below this value are filtered out
         * @param minimumRatio when positive, scores below this ratio times the
         *                     average score are also filtered out
         */
        public ComputeScores(float minimumScore, float minimumRatio) {
            this.minimumScore = minimumScore;
            this.minimumRatio = minimumRatio;
        }

        @Override
        public void open(Configuration parameters)
                throws Exception {
            super.open(parameters);

            if (minimumRatio > 0.0f) {
                Collection<Tuple2<FloatValue, LongValue>> broadcast =
                    getRuntimeContext().getBroadcastVariable(SUM_OF_SCORES_AND_NUMBER_OF_NEIGHBOR_PAIRS);

                // Guard against an empty broadcast set (presumably possible when the
                // input graph has no vertices); calling next() on it would throw
                // NoSuchElementException.
                if (!broadcast.isEmpty()) {
                    Tuple2<FloatValue, LongValue> sumAndCount = broadcast.iterator().next();
                    long numberOfNeighborPairs = sumAndCount.f1.getValue();

                    // With zero neighbor pairs the average is undefined (0/0 = NaN) and
                    // Math.max would propagate the NaN into the threshold; keep the
                    // configured minimum score instead.
                    if (numberOfNeighborPairs > 0) {
                        float averageScore = sumAndCount.f0.getValue() / numberOfNeighborPairs;

                        minimumScore = Math.max(minimumScore, averageScore * minimumRatio);
                    }
                }
            }
        }

        @Override
        public void reduce(Iterable<Tuple3<T, T, FloatValue>> values, Collector<Result<T>> out)
                throws Exception {
            float sum = 0;
            Tuple3<T, T, FloatValue> edge = null;

            // every record in the group shares the same vertex pair; keep the last
            // one to read the IDs from after the loop
            for (Tuple3<T, T, FloatValue> next : values) {
                edge = next;
                sum += next.f2.getValue();
            }

            if (sum >= minimumScore) {
                output.f0 = edge.f0;
                output.f1 = edge.f1;
                output.f2.setValue(sum);
                out.collect(output);
            }
        }
    }

    /**
     * Wraps {@link Tuple3} to encapsulate results from the Adamic-Adar algorithm.
     * <p>
     * Fields 0 and 1 hold the two vertex IDs and field 2 holds the score.
     * {@link #compareTo(Result)} orders results by score only.
     *
     * @param <T> ID type
     */
    public static class Result<T>
            extends Tuple3<T, T, FloatValue>
            implements PrintableResult, BinaryResult<T>, Comparable<Result<T>> {
        public static final int HASH_SEED = 0xe405f6d1;

        private Murmur3_32 hasher = new Murmur3_32(HASH_SEED);

        /**
         * No-args constructor.
         */
        public Result() {
            f2 = new FloatValue();
        }

        @Override
        public T getVertexId0() {
            return f0;
        }

        @Override
        public void setVertexId0(T value) {
            f0 = value;
        }

        @Override
        public T getVertexId1() {
            return f1;
        }

        @Override
        public void setVertexId1(T value) {
            f1 = value;
        }

        /**
         * Get the Adamic-Adar score, equal to the sum over common neighbors of
         * the inverse logarithm of degree.
         *
         * @return Adamic-Adar score
         */
        public FloatValue getAdamicAdarScore() {
            return f2;
        }

        @Override
        public String toPrintableString() {
            return "Vertex IDs: (" + getVertexId0() + ", " + getVertexId1()
                + "), adamic-adar score: " + getAdamicAdarScore();
        }

        @Override
        public int hashCode() {
            return hasher.reset()
                .hash(f0.hashCode())
                .hash(f1.hashCode())
                .hash(f2.getValue())
                .hash();
        }

        @Override
        public int compareTo(Result<T> o) {
            return Float.compare(getAdamicAdarScore().getValue(), o.getAdamicAdarScore().getValue());
        }
    }
}