/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.graph.library.link_analysis;

import org.apache.flink.api.common.aggregators.ConvergenceCriterion;
import org.apache.flink.api.common.aggregators.DoubleSumAggregator;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.functions.RichJoinFunction;
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.common.operators.base.ReduceOperatorBase.CombineHint;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFields;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFieldsFirst;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFieldsSecond;
import org.apache.flink.api.java.operators.IterativeDataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.asm.result.PrintableResult;
import org.apache.flink.graph.asm.result.UnaryResult;
import org.apache.flink.graph.library.link_analysis.Functions.SumScore;
import org.apache.flink.graph.library.link_analysis.HITS.Result;
import org.apache.flink.graph.utils.Murmur3_32;
import org.apache.flink.graph.utils.proxy.GraphAlgorithmWrappingDataSet;
import org.apache.flink.types.DoubleValue;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;

import java.util.Collection;

import static org.apache.flink.api.common.ExecutionConfig.PARALLELISM_DEFAULT;

/**
 * Hyperlink-Induced Topic Search computes two interdependent scores for every
 * vertex in a directed graph. A good "hub" links to good "authorities" and
 * good "authorities" are linked from good "hubs".
 * <p>
 * This algorithm can be configured to terminate either by a limit on the number
 * of iterations, a convergence threshold, or both.
 * <p>
 * http://www.cs.cornell.edu/home/kleinber/auth.pdf
 *
 * @param <K> graph ID type
 * @param <VV> vertex value type
 * @param <EV> edge value type
 */
public class HITS<K, VV, EV>
extends GraphAlgorithmWrappingDataSet<K, VV, EV, Result<K>> {

	// Name of the iteration aggregator tracking the per-superstep change in scores.
	private static final String CHANGE_IN_SCORES = "change in scores";

	// Broadcast-set names for the normalization terms computed each superstep.
	private static final String HUBBINESS_SUM_SQUARED = "hubbiness sum squared";

	private static final String AUTHORITY_SUM_SQUARED = "authority sum squared";

	// Required configuration
	private int maxIterations;

	private double convergenceThreshold;

	// Optional configuration
	private int parallelism = PARALLELISM_DEFAULT;

	/**
	 * Hyperlink-Induced Topic Search with a fixed number of iterations.
	 *
	 * @param iterations fixed number of iterations
	 */
	public HITS(int iterations) {
		// Double.MAX_VALUE effectively disables the convergence check
		// (see the `convergenceThreshold < Double.MAX_VALUE` test in runInternal).
		this(iterations, Double.MAX_VALUE);
	}

	/**
	 * Hyperlink-Induced Topic Search with a convergence threshold. The algorithm
	 * terminates when the total change in hub and authority scores over all
	 * vertices falls to or below the given threshold value.
	 *
	 * @param convergenceThreshold convergence threshold for sum of scores
	 */
	public HITS(double convergenceThreshold) {
		// Integer.MAX_VALUE effectively removes the iteration limit;
		// termination is then driven solely by the convergence criterion.
		this(Integer.MAX_VALUE, convergenceThreshold);
	}

	/**
	 * Hyperlink-Induced Topic Search with a convergence threshold and a maximum
	 * iteration count. The algorithm terminates after either the given number
	 * of iterations or when the total change in hub and authority scores over all
	 * vertices falls to or below the given threshold value.
	 *
	 * @param maxIterations maximum number of iterations
	 * @param convergenceThreshold convergence threshold for sum of scores
	 */
	public HITS(int maxIterations, double convergenceThreshold) {
		Preconditions.checkArgument(maxIterations > 0, "Number of iterations must be greater than zero");
		Preconditions.checkArgument(convergenceThreshold > 0.0, "Convergence threshold must be greater than zero");

		this.maxIterations = maxIterations;
		this.convergenceThreshold = convergenceThreshold;
	}

	/**
	 * Override the operator parallelism.
	 *
	 * @param parallelism operator parallelism
	 * @return this
	 */
	public HITS<K, VV, EV> setParallelism(int parallelism) {
		this.parallelism = parallelism;

		return this;
	}

	@Override
	protected String getAlgorithmName() {
		return HITS.class.getName();
	}

	@Override
	protected boolean mergeConfiguration(GraphAlgorithmWrappingDataSet other) {
		Preconditions.checkNotNull(other);

		if (! HITS.class.isAssignableFrom(other.getClass())) {
			return false;
		}

		HITS rhs = (HITS) other;

		// merge configurations: take the strictest-possible combination so the
		// merged algorithm satisfies both original configurations
		maxIterations = Math.max(maxIterations, rhs.maxIterations);
		convergenceThreshold = Math.min(convergenceThreshold, rhs.convergenceThreshold);
		// PARALLELISM_DEFAULT defers to the other side; otherwise use the smaller value
		parallelism = (parallelism == PARALLELISM_DEFAULT) ? rhs.parallelism :
			((rhs.parallelism == PARALLELISM_DEFAULT) ? parallelism : Math.min(parallelism, rhs.parallelism));

		return true;
	}

	@Override
	public DataSet<Result<K>> runInternal(Graph<K, VV, EV> input)
			throws Exception {
		// strip edge values; only the (source, target) topology is needed
		DataSet<Tuple2<K, K>> edges = input
			.getEdges()
			.map(new ExtractEdgeIDs<K, EV>())
				.setParallelism(parallelism)
				.name("Extract edge IDs");

		// ID, hub, authority
		// One record per vertex with an in-edge: hub = 0.0 and authority = the
		// vertex's in-degree (1.0 summed per in-edge). Scores are non-normalized.
		DataSet<Tuple3<K, DoubleValue, DoubleValue>> initialScores = edges
			.map(new InitializeScores<K>())
				.setParallelism(parallelism)
				.name("Initial scores")
			.groupBy(0)
			.reduce(new SumScores<K>())
			.setCombineHint(CombineHint.HASH)
				.setParallelism(parallelism)
				.name("Sum");

		IterativeDataSet<Tuple3<K, DoubleValue, DoubleValue>> iterative = initialScores
			.iterate(maxIterations);

		// ID, hubbiness
		// Each vertex's authority score is propagated to the sources of its
		// in-edges, then summed per source to form the new hub scores.
		DataSet<Tuple2<K, DoubleValue>> hubbiness = iterative
			.coGroup(edges)
			.where(0)
			.equalTo(1)
			.with(new Hubbiness<K>())
				.setParallelism(parallelism)
				.name("Hub")
			.groupBy(0)
			.reduce(new SumScore<K>())
			.setCombineHint(CombineHint.HASH)
				.setParallelism(parallelism)
				.name("Sum");

		// sum-of-hubbiness-squared, used below to normalize hub scores
		DataSet<DoubleValue> hubbinessSumSquared = hubbiness
			.map(new Square<K>())
				.setParallelism(parallelism)
				.name("Square")
			.reduce(new Sum())
			.setCombineHint(CombineHint.HASH)
				.setParallelism(parallelism)
				.name("Sum");

		// ID, new authority
		// Each vertex's new hub score is propagated to the targets of its
		// out-edges, then summed per target to form the new authority scores.
		DataSet<Tuple2<K, DoubleValue>> authority = hubbiness
			.coGroup(edges)
			.where(0)
			.equalTo(0)
			.with(new Authority<K>())
				.setParallelism(parallelism)
				.name("Authority")
			.groupBy(0)
			.reduce(new SumScore<K>())
			.setCombineHint(CombineHint.HASH)
				.setParallelism(parallelism)
				.name("Sum");

		// sum-of-authority-squared, used below to normalize authority scores
		DataSet<DoubleValue> authoritySumSquared = authority
			.map(new Square<K>())
				.setParallelism(parallelism)
				.name("Square")
			.reduce(new Sum())
			.setCombineHint(CombineHint.HASH)
				.setParallelism(parallelism)
				.name("Sum");

		// ID, normalized hubbiness, normalized authority
		// Full outer join: a vertex may have only a hub or only an authority
		// score; the join function fills the missing side with 0.0.
		DataSet<Tuple3<K, DoubleValue, DoubleValue>> scores = hubbiness
			.fullOuterJoin(authority, JoinHint.REPARTITION_SORT_MERGE)
			.where(0)
			.equalTo(0)
			.with(new JoinAndNormalizeHubAndAuthority<K>())
			.withBroadcastSet(hubbinessSumSquared, HUBBINESS_SUM_SQUARED)
			.withBroadcastSet(authoritySumSquared, AUTHORITY_SUM_SQUARED)
				.setParallelism(parallelism)
				.name("Join scores");

		DataSet<Tuple3<K, DoubleValue, DoubleValue>> passThrough;

		if (convergenceThreshold < Double.MAX_VALUE) {
			// Compare the new scores against the previous iteration's scores and
			// feed the summed absolute change into the convergence aggregator.
			passThrough = iterative
				.fullOuterJoin(scores, JoinHint.REPARTITION_SORT_MERGE)
				.where(0)
				.equalTo(0)
				.with(new ChangeInScores<K>())
					.setParallelism(parallelism)
					.name("Change in scores");

			iterative.registerAggregationConvergenceCriterion(CHANGE_IN_SCORES, new DoubleSumAggregator(), new ScoreConvergence(convergenceThreshold));
		} else {
			// no convergence check requested; iterate the fixed number of times
			passThrough = scores;
		}

		return iterative
			.closeWith(passThrough)
			.map(new TranslateResult<K>())
				.setParallelism(parallelism)
				.name("Map result");
	}

	/**
	 * Map edges and remove the edge value.
	 *
	 * @param <T> ID type
	 * @param <ET> edge value type
	 *
	 * @see Graph.ExtractEdgeIDsMapper
	 */
	@ForwardedFields("0; 1")
	private static class ExtractEdgeIDs<T, ET>
	implements MapFunction<Edge<T, ET>, Tuple2<T, T>> {
		// reused output tuple to avoid per-record allocation
		private Tuple2<T, T> output = new Tuple2<>();

		@Override
		public Tuple2<T, T> map(Edge<T, ET> value)
				throws Exception {
			output.f0 = value.f0;
			output.f1 = value.f1;
			return output;
		}
	}

	/**
	 * Initialize vertices' authority scores by assigning each vertex with an
	 * initial hub score of 1.0. The hub scores are initialized to zero since
	 * these will be computed based on the initial authority scores.
	 *
	 * The initial scores are non-normalized.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFields("1->0")
	private static class InitializeScores<T>
	implements MapFunction<Tuple2<T, T>, Tuple3<T, DoubleValue, DoubleValue>> {
		// (ID, hub = 0.0, authority = 1.0); keyed by the edge target (value.f1)
		private Tuple3<T, DoubleValue, DoubleValue> output = new Tuple3<>(null, new DoubleValue(0.0), new DoubleValue(1.0));

		@Override
		public Tuple3<T, DoubleValue, DoubleValue> map(Tuple2<T, T> value)
				throws Exception {
			output.f0 = value.f1;
			return output;
		}
	}

	/**
	 * Sum vertices' hub and authority scores.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFields("0")
	private static class SumScores<T>
	implements ReduceFunction<Tuple3<T, DoubleValue, DoubleValue>> {
		@Override
		public Tuple3<T, DoubleValue, DoubleValue> reduce(Tuple3<T, DoubleValue, DoubleValue> left, Tuple3<T, DoubleValue, DoubleValue> right)
				throws Exception {
			// accumulate in-place into the left tuple's mutable DoubleValues
			left.f1.setValue(left.f1.getValue() + right.f1.getValue());
			left.f2.setValue(left.f2.getValue() + right.f2.getValue());
			return left;
		}
	}

	/**
	 * The hub score is the sum of authority scores of vertices on out-edges.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFieldsFirst("2->1")
	@ForwardedFieldsSecond("0")
	private static class Hubbiness<T>
	implements CoGroupFunction<Tuple3<T, DoubleValue, DoubleValue>, Tuple2<T, T>, Tuple2<T, DoubleValue>> {
		private Tuple2<T, DoubleValue> output = new Tuple2<>();

		@Override
		public void coGroup(Iterable<Tuple3<T, DoubleValue, DoubleValue>> vertex, Iterable<Tuple2<T, T>> edges, Collector<Tuple2<T, DoubleValue>> out)
				throws Exception {
			// emit the vertex's authority score once per in-edge, keyed by the
			// edge source (the hub on the other end of the edge)
			output.f1 = vertex.iterator().next().f2;

			for (Tuple2<T, T> edge : edges) {
				output.f0 = edge.f0;
				out.collect(output);
			}
		}
	}

	/**
	 * The authority score is the sum of hub scores of vertices on in-edges.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFieldsFirst("1")
	@ForwardedFieldsSecond("1->0")
	private static class Authority<T>
	implements CoGroupFunction<Tuple2<T, DoubleValue>, Tuple2<T, T>, Tuple2<T, DoubleValue>> {
		private Tuple2<T, DoubleValue> output = new Tuple2<>();

		@Override
		public void coGroup(Iterable<Tuple2<T, DoubleValue>> vertex, Iterable<Tuple2<T, T>> edges, Collector<Tuple2<T, DoubleValue>> out)
				throws Exception {
			// emit the vertex's hub score once per out-edge, keyed by the
			// edge target (the authority on the other end of the edge)
			output.f1 = vertex.iterator().next().f1;

			for (Tuple2<T, T> edge : edges) {
				output.f0 = edge.f1;
				out.collect(output);
			}
		}
	}

	/**
	 * Compute the square of each score.
	 *
	 * @param <T> ID type
	 */
	private static class Square<T>
	implements MapFunction<Tuple2<T, DoubleValue>, DoubleValue> {
		private DoubleValue output = new DoubleValue();

		@Override
		public DoubleValue map(Tuple2<T, DoubleValue> value)
				throws Exception {
			double val = value.f1.getValue();
			output.setValue(val * val);

			return output;
		}
	}

	/**
	 * Sum over values. This specialized function is used in place of generic aggregation.
	 */
	private static class Sum
	implements ReduceFunction<DoubleValue> {
		@Override
		public DoubleValue reduce(DoubleValue first, DoubleValue second)
				throws Exception {
			first.setValue(first.getValue() + second.getValue());
			return first;
		}
	}

	/**
	 * Join and normalize the hub and authority scores.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFieldsFirst("0")
	@ForwardedFieldsSecond("0")
	private static class JoinAndNormalizeHubAndAuthority<T>
	extends RichJoinFunction<Tuple2<T, DoubleValue>, Tuple2<T, DoubleValue>, Tuple3<T, DoubleValue, DoubleValue>> {
		private Tuple3<T, DoubleValue, DoubleValue> output = new Tuple3<>(null, new DoubleValue(), new DoubleValue());

		// Euclidean norms read from the broadcast sets in open(); each is a
		// single-element collection holding the sum of squared scores.
		private double hubbinessRootSumSquared;

		private double authorityRootSumSquared;

		@Override
		public void open(Configuration parameters)
				throws Exception {
			super.open(parameters);

			Collection<DoubleValue> var;
			var = getRuntimeContext().getBroadcastVariable(HUBBINESS_SUM_SQUARED);
			hubbinessRootSumSquared = Math.sqrt(var.iterator().next().getValue());

			var = getRuntimeContext().getBroadcastVariable(AUTHORITY_SUM_SQUARED);
			authorityRootSumSquared = Math.sqrt(var.iterator().next().getValue());
		}

		@Override
		public Tuple3<T, DoubleValue, DoubleValue> join(Tuple2<T, DoubleValue> hubbiness, Tuple2<T, DoubleValue> authority)
				throws Exception {
			// full outer join: exactly one side may be null; a missing score
			// normalizes to 0.0
			output.f0 = (authority == null) ? hubbiness.f0 : authority.f0;
			output.f1.setValue(hubbiness == null ? 0.0 : hubbiness.f1.getValue() / hubbinessRootSumSquared);
			output.f2.setValue(authority == null ? 0.0 : authority.f1.getValue() / authorityRootSumSquared);
			return output;
		}
	}

	/**
	 * Computes the total sum of the change in hub and authority scores over
	 * all vertices between iterations. A negative score is emitted after the
	 * first iteration to prevent premature convergence.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFieldsFirst("0")
	@ForwardedFieldsSecond("*")
	private static class ChangeInScores<T>
	extends RichJoinFunction<Tuple3<T, DoubleValue, DoubleValue>, Tuple3<T, DoubleValue, DoubleValue>, Tuple3<T, DoubleValue, DoubleValue>> {
		private boolean isInitialSuperstep;

		// running sum of |new - old| over all scores seen by this task;
		// folded into the iteration aggregator in close()
		private double changeInScores;

		@Override
		public void open(Configuration parameters)
				throws Exception {
			super.open(parameters);

			isInitialSuperstep = (getIterationRuntimeContext().getSuperstepNumber() == 1);
			// the initial scores are non-normalized, so the first superstep's
			// change cannot be measured; -1.0 keeps the aggregate negative and
			// forces ScoreConvergence to continue iterating
			changeInScores = (isInitialSuperstep) ? -1.0 : 0.0;
		}

		@Override
		public void close()
				throws Exception {
			super.close();

			DoubleSumAggregator agg = getIterationRuntimeContext().getIterationAggregator(CHANGE_IN_SCORES);
			agg.aggregate(changeInScores);
		}

		@Override
		public Tuple3<T, DoubleValue, DoubleValue> join(Tuple3<T, DoubleValue, DoubleValue> first, Tuple3<T, DoubleValue, DoubleValue> second)
				throws Exception {
			if (! isInitialSuperstep) {
				changeInScores += Math.abs(second.f1.getValue() - first.f1.getValue());
				changeInScores += Math.abs(second.f2.getValue() - first.f2.getValue());
			}

			// always pass through the new scores unchanged
			return second;
		}
	}

	/**
	 * Monitors the total change in hub and authority scores over all vertices.
	 * The algorithm terminates when the change in scores compared against the
	 * prior iteration falls to or below the given convergence threshold.
	 *
	 * An optimization of this implementation of HITS is to leave the initial
	 * scores non-normalized; therefore, the change in scores after the first
	 * superstep cannot be measured and a negative value is emitted to signal
	 * that the iteration should continue.
	 */
	private static class ScoreConvergence
	implements ConvergenceCriterion<DoubleValue> {
		private double convergenceThreshold;

		public ScoreConvergence(double convergenceThreshold) {
			this.convergenceThreshold = convergenceThreshold;
		}

		@Override
		public boolean isConverged(int iteration, DoubleValue value) {
			double val = value.getValue();
			// a negative aggregate (first superstep) never converges
			return (0 <= val && val <= convergenceThreshold);
		}
	}

	/**
	 * Map the Tuple result to the return type.
	 *
	 * @param <T> ID type
	 */
	@ForwardedFields("0; 1; 2")
	private static class TranslateResult<T>
	implements MapFunction<Tuple3<T, DoubleValue, DoubleValue>, Result<T>> {
		private Result<T> output = new Result<>();

		@Override
		public Result<T> map(Tuple3<T, DoubleValue, DoubleValue> value)
				throws Exception {
			output.f0 = value.f0;
			output.f1 = value.f1;
			output.f2 = value.f2;
			return output;
		}
	}

	/**
	 * Wraps the {@link Tuple3} to encapsulate results from the HITS algorithm.
	 *
	 * @param <T> ID type
	 */
	public static class Result<T>
	extends Tuple3<T, DoubleValue, DoubleValue>
	implements PrintableResult, UnaryResult<T> {
		public static final int HASH_SEED = 0xc7e39a63;

		// per-instance hasher; reset() is called before each hashCode computation
		private Murmur3_32 hasher = new Murmur3_32(HASH_SEED);

		@Override
		public T getVertexId0() {
			return f0;
		}

		@Override
		public void setVertexId0(T value) {
			f0 = value;
		}

		/**
		 * Get the hub score. Good hubs link to good authorities.
		 *
		 * @return the hub score
		 */
		public DoubleValue getHubScore() {
			return f1;
		}

		/**
		 * Get the authority score. Good authorities link to good hubs.
		 *
		 * @return the authority score
		 */
		public DoubleValue getAuthorityScore() {
			return f2;
		}

		// NOTE(review): implements PrintableResult#toPrintableString; consider
		// adding @Override for consistency with the other interface methods.
		public String toPrintableString() {
			return "Vertex ID: " + getVertexId0()
				+ ", hub score: " + getHubScore()
				+ ", authority score: " + getAuthorityScore();
		}

		@Override
		public int hashCode() {
			return hasher.reset()
				.hash(f0.hashCode())
				.hash(f1.getValue())
				.hash(f2.getValue())
				.hash();
		}
	}
}