/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.graph.library.link_analysis;
import org.apache.flink.api.common.aggregators.ConvergenceCriterion;
import org.apache.flink.api.common.aggregators.DoubleSumAggregator;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.functions.RichJoinFunction;
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint;
import org.apache.flink.api.common.operators.base.ReduceOperatorBase.CombineHint;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFields;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFieldsFirst;
import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFieldsSecond;
import org.apache.flink.api.java.operators.IterativeDataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.graph.Edge;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.asm.result.PrintableResult;
import org.apache.flink.graph.asm.result.UnaryResult;
import org.apache.flink.graph.library.link_analysis.Functions.SumScore;
import org.apache.flink.graph.library.link_analysis.HITS.Result;
import org.apache.flink.graph.utils.Murmur3_32;
import org.apache.flink.graph.utils.proxy.GraphAlgorithmWrappingDataSet;
import org.apache.flink.types.DoubleValue;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;
import java.util.Collection;
import static org.apache.flink.api.common.ExecutionConfig.PARALLELISM_DEFAULT;
/**
* Hyperlink-Induced Topic Search computes two interdependent scores for every
* vertex in a directed graph. A good "hub" links to good "authorities" and
* good "authorities" are linked from good "hubs".
* <p>
* This algorithm can be configured to terminate either by a limit on the number
* of iterations, a convergence threshold, or both.
* <p>
* http://www.cs.cornell.edu/home/kleinber/auth.pdf
*
* @param <K> graph ID type
* @param <VV> vertex value type
* @param <EV> edge value type
*/
public class HITS<K, VV, EV>
extends GraphAlgorithmWrappingDataSet<K, VV, EV, Result<K>> {
private static final String CHANGE_IN_SCORES = "change in scores";
private static final String HUBBINESS_SUM_SQUARED = "hubbiness sum squared";
private static final String AUTHORITY_SUM_SQUARED = "authority sum squared";
// Required configuration
private int maxIterations;
private double convergenceThreshold;
// Optional configuration
private int parallelism = PARALLELISM_DEFAULT;
/**
 * Hyperlink-Induced Topic Search with a fixed number of iterations and no
 * convergence threshold.
 *
 * @param iterations fixed number of iterations
 */
public HITS(int iterations) {
// Double.MAX_VALUE disables the convergence check in runInternal, so the
// iteration always runs to the fixed count
this(iterations, Double.MAX_VALUE);
}
/**
 * Hyperlink-Induced Topic Search with a convergence threshold. The algorithm
 * terminates when the total change in hub and authority scores over all
 * vertices falls to or below the given threshold value.
 *
 * @param convergenceThreshold convergence threshold for sum of scores
 */
public HITS(double convergenceThreshold) {
// Integer.MAX_VALUE effectively removes the iteration limit; termination
// is driven by the convergence criterion alone
this(Integer.MAX_VALUE, convergenceThreshold);
}
/**
 * Hyperlink-Induced Topic Search terminating after the given maximum number
 * of iterations, or earlier once the total change in hub and authority
 * scores over all vertices falls to or below the given threshold value.
 *
 * @param maxIterations maximum number of iterations
 * @param convergenceThreshold convergence threshold for sum of scores
 */
public HITS(int maxIterations, double convergenceThreshold) {
    Preconditions.checkArgument(maxIterations > 0, "Number of iterations must be greater than zero");
    Preconditions.checkArgument(convergenceThreshold > 0.0, "Convergence threshold must be greater than zero");

    this.convergenceThreshold = convergenceThreshold;
    this.maxIterations = maxIterations;
}
/**
 * Override the operator parallelism.
 *
 * @param parallelism operator parallelism; defaults to
 *                    {@code PARALLELISM_DEFAULT} when not set
 * @return this
 */
public HITS<K, VV, EV> setParallelism(int parallelism) {
this.parallelism = parallelism;
// fluent setter: return this to allow chained configuration
return this;
}
@Override
protected String getAlgorithmName() {
// fully-qualified class name identifies this algorithm to the wrapping
// framework (see mergeConfiguration for how equivalent runs are merged)
return HITS.class.getName();
}
/**
 * Merges this configuration with that of an equivalent {@code HITS}
 * instance so a single operator plan can serve both: the larger iteration
 * count, the tighter (smaller) convergence threshold, and the smaller
 * explicitly-set parallelism win.
 *
 * @param other the other algorithm whose configuration is merged in
 * @return true if and only if {@code other} is a {@code HITS} instance and
 *         the configurations were merged
 */
@Override
protected boolean mergeConfiguration(GraphAlgorithmWrappingDataSet other) {
    Preconditions.checkNotNull(other);

    // non-null was just verified, so instanceof is equivalent to the
    // isAssignableFrom check and reads more directly
    if (!(other instanceof HITS)) {
        return false;
    }

    // wildcard cast avoids the raw-type cast's unchecked usage; only the
    // non-generic configuration fields are read below
    HITS<?, ?, ?> rhs = (HITS<?, ?, ?>) other;

    // merge configurations: run at least as long and converge at least as
    // tightly as either algorithm requested
    maxIterations = Math.max(maxIterations, rhs.maxIterations);
    convergenceThreshold = Math.min(convergenceThreshold, rhs.convergenceThreshold);

    // a default parallelism defers to the other side; when both are explicit,
    // take the smaller value
    parallelism = (parallelism == PARALLELISM_DEFAULT) ? rhs.parallelism :
        ((rhs.parallelism == PARALLELISM_DEFAULT) ? parallelism : Math.min(parallelism, rhs.parallelism));

    return true;
}
@Override
public DataSet<Result<K>> runInternal(Graph<K, VV, EV> input)
throws Exception {
// HITS uses only the graph topology; strip the edge values
DataSet<Tuple2<K, K>> edges = input
.getEdges()
.map(new ExtractEdgeIDs<K, EV>())
.setParallelism(parallelism)
.name("Extract edge IDs");
// ID, hub, authority
// each edge contributes an authority score of 1.0 to its target vertex, so
// a vertex's initial (non-normalized) authority score equals its in-degree
// and its initial hub score is zero
DataSet<Tuple3<K, DoubleValue, DoubleValue>> initialScores = edges
.map(new InitializeScores<K>())
.setParallelism(parallelism)
.name("Initial scores")
.groupBy(0)
.reduce(new SumScores<K>())
.setCombineHint(CombineHint.HASH)
.setParallelism(parallelism)
.name("Sum");
// bulk iteration over the per-vertex (hub, authority) score tuples
IterativeDataSet<Tuple3<K, DoubleValue, DoubleValue>> iterative = initialScores
.iterate(maxIterations);
// ID, hubbiness
// new hub score of a vertex = sum of authority scores of the targets of its
// out-edges (scores are matched to edges on the edge target, field 1)
DataSet<Tuple2<K, DoubleValue>> hubbiness = iterative
.coGroup(edges)
.where(0)
.equalTo(1)
.with(new Hubbiness<K>())
.setParallelism(parallelism)
.name("Hub")
.groupBy(0)
.reduce(new SumScore<K>())
.setCombineHint(CombineHint.HASH)
.setParallelism(parallelism)
.name("Sum");
// sum-of-hubbiness-squared, used below to normalize hub scores
DataSet<DoubleValue> hubbinessSumSquared = hubbiness
.map(new Square<K>())
.setParallelism(parallelism)
.name("Square")
.reduce(new Sum())
.setCombineHint(CombineHint.HASH)
.setParallelism(parallelism)
.name("Sum");
// ID, new authority
// new authority score of a vertex = sum of the just-computed hub scores of
// the sources of its in-edges (hub scores matched on edge source, field 0)
DataSet<Tuple2<K, DoubleValue>> authority = hubbiness
.coGroup(edges)
.where(0)
.equalTo(0)
.with(new Authority<K>())
.setParallelism(parallelism)
.name("Authority")
.groupBy(0)
.reduce(new SumScore<K>())
.setCombineHint(CombineHint.HASH)
.setParallelism(parallelism)
.name("Sum");
// sum-of-authority-squared, used below to normalize authority scores
DataSet<DoubleValue> authoritySumSquared = authority
.map(new Square<K>())
.setParallelism(parallelism)
.name("Square")
.reduce(new Sum())
.setCombineHint(CombineHint.HASH)
.setParallelism(parallelism)
.name("Sum");
// ID, normalized hubbiness, normalized authority
// full outer join because a vertex with no out-edges has no hub record and
// a vertex with no in-edges has no authority record; missing scores are
// filled with 0.0 by the join function
DataSet<Tuple3<K, DoubleValue, DoubleValue>> scores = hubbiness
.fullOuterJoin(authority, JoinHint.REPARTITION_SORT_MERGE)
.where(0)
.equalTo(0)
.with(new JoinAndNormalizeHubAndAuthority<K>())
.withBroadcastSet(hubbinessSumSquared, HUBBINESS_SUM_SQUARED)
.withBroadcastSet(authoritySumSquared, AUTHORITY_SUM_SQUARED)
.setParallelism(parallelism)
.name("Join scores");
DataSet<Tuple3<K, DoubleValue, DoubleValue>> passThrough;
if (convergenceThreshold < Double.MAX_VALUE) {
// compare the new scores against the previous superstep's and feed the
// total change into an iteration aggregator; the convergence criterion
// below terminates the iteration once the change is small enough
passThrough = iterative
.fullOuterJoin(scores, JoinHint.REPARTITION_SORT_MERGE)
.where(0)
.equalTo(0)
.with(new ChangeInScores<K>())
.setParallelism(parallelism)
.name("Change in scores");
iterative.registerAggregationConvergenceCriterion(CHANGE_IN_SCORES, new DoubleSumAggregator(), new ScoreConvergence(convergenceThreshold));
} else {
// no threshold configured: iterate the fixed number of times
passThrough = scores;
}
return iterative
.closeWith(passThrough)
.map(new TranslateResult<K>())
.setParallelism(parallelism)
.name("Map result");
}
/**
 * Projects each edge onto its source and target vertex IDs, discarding the
 * edge value. The output tuple is reused across calls per Flink's
 * object-reuse convention.
 *
 * @param <T> ID type
 * @param <ET> edge value type
 */
@ForwardedFields("0; 1")
private static class ExtractEdgeIDs<T, ET>
implements MapFunction<Edge<T, ET>, Tuple2<T, T>> {
    private Tuple2<T, T> edge = new Tuple2<>();

    @Override
    public Tuple2<T, T> map(Edge<T, ET> value)
            throws Exception {
        // copy (source, target); the edge value (value.f2) is dropped
        edge.setFields(value.f0, value.f1);
        return edge;
    }
}
/**
 * Emits, for each edge, a score tuple keyed by the edge's target vertex
 * carrying a hub score of zero and an authority score of 1.0. Summed per
 * vertex ID, each vertex's initial authority score equals its in-degree.
 * Hub scores are initialized to zero since these will be computed from the
 * initial authority scores.
 *
 * The initial scores are non-normalized.
 *
 * @param <T> ID type
 */
@ForwardedFields("1->0")
private static class InitializeScores<T>
implements MapFunction<Tuple2<T, T>, Tuple3<T, DoubleValue, DoubleValue>> {
// reused output tuple: f1 = hub score (0.0), f2 = authority score (1.0)
private Tuple3<T, DoubleValue, DoubleValue> output = new Tuple3<>(null, new DoubleValue(0.0), new DoubleValue(1.0));
@Override
public Tuple3<T, DoubleValue, DoubleValue> map(Tuple2<T, T> value) throws Exception {
// key by the edge's target vertex (the authority endpoint)
output.f0 = value.f1;
return output;
}
}
/**
 * Accumulates hub and authority scores pairwise, adding the right tuple's
 * scores into the left tuple's mutable {@link DoubleValue} fields and
 * returning the left tuple (object reuse).
 *
 * @param <T> ID type
 */
@ForwardedFields("0")
private static class SumScores<T>
implements ReduceFunction<Tuple3<T, DoubleValue, DoubleValue>> {
    @Override
    public Tuple3<T, DoubleValue, DoubleValue> reduce(Tuple3<T, DoubleValue, DoubleValue> accumulator, Tuple3<T, DoubleValue, DoubleValue> element)
            throws Exception {
        DoubleValue hub = accumulator.f1;
        DoubleValue auth = accumulator.f2;

        hub.setValue(hub.getValue() + element.f1.getValue());
        auth.setValue(auth.getValue() + element.f2.getValue());

        return accumulator;
    }
}
/**
 * Distributes each vertex's authority score to the sources of its in-edges;
 * summed per source, this yields the new hub score (the sum of authority
 * scores of vertices on out-edges).
 *
 * @param <T> ID type
 */
@ForwardedFieldsFirst("2->1")
@ForwardedFieldsSecond("0")
private static class Hubbiness<T>
implements CoGroupFunction<Tuple3<T, DoubleValue, DoubleValue>, Tuple2<T, T>, Tuple2<T, DoubleValue>> {
    // reused output: (edge source ID, authority score of the shared target)
    private Tuple2<T, DoubleValue> result = new Tuple2<>();

    @Override
    public void coGroup(Iterable<Tuple3<T, DoubleValue, DoubleValue>> vertex, Iterable<Tuple2<T, T>> edges, Collector<Tuple2<T, DoubleValue>> out)
            throws Exception {
        // the score set holds one record per vertex ID
        DoubleValue authorityScore = vertex.iterator().next().f2;

        // credit the target's authority score to every in-edge's source
        for (Tuple2<T, T> edge : edges) {
            result.f0 = edge.f0;
            result.f1 = authorityScore;
            out.collect(result);
        }
    }
}
/**
 * Distributes each vertex's hub score to the targets of its out-edges;
 * summed per target, this yields the new authority score (the sum of hub
 * scores of vertices on in-edges).
 *
 * @param <T> ID type
 */
@ForwardedFieldsFirst("1")
@ForwardedFieldsSecond("1->0")
private static class Authority<T>
implements CoGroupFunction<Tuple2<T, DoubleValue>, Tuple2<T, T>, Tuple2<T, DoubleValue>> {
    // reused output: (edge target ID, hub score of the shared source)
    private Tuple2<T, DoubleValue> result = new Tuple2<>();

    @Override
    public void coGroup(Iterable<Tuple2<T, DoubleValue>> vertex, Iterable<Tuple2<T, T>> edges, Collector<Tuple2<T, DoubleValue>> out)
            throws Exception {
        // the hub-score set holds one record per vertex ID
        DoubleValue hubScore = vertex.iterator().next().f1;

        // credit the source's hub score to every out-edge's target
        for (Tuple2<T, T> edge : edges) {
            result.f0 = edge.f1;
            result.f1 = hubScore;
            out.collect(result);
        }
    }
}
/**
 * Maps each (ID, score) pair to the square of the score, reusing a single
 * {@link DoubleValue} for the output.
 *
 * @param <T> ID type
 */
private static class Square<T>
implements MapFunction<Tuple2<T, DoubleValue>, DoubleValue> {
    private DoubleValue squared = new DoubleValue();

    @Override
    public DoubleValue map(Tuple2<T, DoubleValue> value)
            throws Exception {
        double score = value.f1.getValue();
        squared.setValue(score * score);
        return squared;
    }
}
/**
 * Adds two {@link DoubleValue}s by accumulating the second into the first
 * (object reuse). Used in place of generic aggregation.
 */
private static class Sum
implements ReduceFunction<DoubleValue> {
    @Override
    public DoubleValue reduce(DoubleValue accumulator, DoubleValue element)
            throws Exception {
        accumulator.setValue(accumulator.getValue() + element.getValue());
        return accumulator;
    }
}
/**
 * Full-outer-joins the hub and authority scores by vertex ID and normalizes
 * each by the square root of the corresponding broadcast sum-of-squares.
 * A missing side of the join (vertex without out-edges or without in-edges)
 * yields a score of 0.0.
 *
 * @param <T> ID type
 */
@ForwardedFieldsFirst("0")
@ForwardedFieldsSecond("0")
private static class JoinAndNormalizeHubAndAuthority<T>
extends RichJoinFunction<Tuple2<T, DoubleValue>, Tuple2<T, DoubleValue>, Tuple3<T, DoubleValue, DoubleValue>> {
    private Tuple3<T, DoubleValue, DoubleValue> result = new Tuple3<>(null, new DoubleValue(), new DoubleValue());

    // L2 norms of the hub and authority score vectors, read once per superstep
    private double hubbinessRootSumSquared;
    private double authorityRootSumSquared;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);

        Collection<DoubleValue> hubSum = getRuntimeContext().getBroadcastVariable(HUBBINESS_SUM_SQUARED);
        hubbinessRootSumSquared = Math.sqrt(hubSum.iterator().next().getValue());

        Collection<DoubleValue> authSum = getRuntimeContext().getBroadcastVariable(AUTHORITY_SUM_SQUARED);
        authorityRootSumSquared = Math.sqrt(authSum.iterator().next().getValue());
    }

    @Override
    public Tuple3<T, DoubleValue, DoubleValue> join(Tuple2<T, DoubleValue> hubbiness, Tuple2<T, DoubleValue> authority)
            throws Exception {
        // at least one side is non-null in a full outer join
        if (authority == null) {
            result.f0 = hubbiness.f0;
        } else {
            result.f0 = authority.f0;
        }

        result.f1.setValue(hubbiness == null ? 0.0 : hubbiness.f1.getValue() / hubbinessRootSumSquared);
        result.f2.setValue(authority == null ? 0.0 : authority.f1.getValue() / authorityRootSumSquared);

        return result;
    }
}
/**
 * Accumulates, per task, the absolute change in hub and authority scores
 * between the previous and current superstep, and publishes the total to
 * the {@code CHANGE_IN_SCORES} iteration aggregator on close. During the
 * first superstep the previous scores are the non-normalized initial
 * scores, so the change cannot be measured; a negative value is
 * contributed instead to prevent premature convergence.
 *
 * @param <T> ID type
 */
@ForwardedFieldsFirst("0")
@ForwardedFieldsSecond("*")
private static class ChangeInScores<T>
extends RichJoinFunction<Tuple3<T, DoubleValue, DoubleValue>, Tuple3<T, DoubleValue, DoubleValue>, Tuple3<T, DoubleValue, DoubleValue>> {
    private boolean firstSuperstep;
    private double totalChange;

    @Override
    public void open(Configuration parameters)
            throws Exception {
        super.open(parameters);

        firstSuperstep = (getIterationRuntimeContext().getSuperstepNumber() == 1);

        // negative sentinel signals "change not measurable yet"; see ScoreConvergence
        totalChange = firstSuperstep ? -1.0 : 0.0;
    }

    @Override
    public void close()
            throws Exception {
        super.close();

        DoubleSumAggregator agg = getIterationRuntimeContext().getIterationAggregator(CHANGE_IN_SCORES);
        agg.aggregate(totalChange);
    }

    @Override
    public Tuple3<T, DoubleValue, DoubleValue> join(Tuple3<T, DoubleValue, DoubleValue> first, Tuple3<T, DoubleValue, DoubleValue> second)
            throws Exception {
        if (!firstSuperstep) {
            totalChange += Math.abs(second.f1.getValue() - first.f1.getValue())
                + Math.abs(second.f2.getValue() - first.f2.getValue());
        }

        // always forward the new (normalized) scores
        return second;
    }
}
/**
 * Monitors the total change in hub and authority scores over all vertices.
 * The algorithm terminates when the change in scores compared against the
 * prior iteration falls to or below the given convergence threshold.
 *
 * An optimization of this implementation of HITS is to leave the initial
 * scores non-normalized; therefore, the change in scores after the first
 * superstep cannot be measured and a negative value is emitted to signal
 * that the iteration should continue.
 */
private static class ScoreConvergence
implements ConvergenceCriterion<DoubleValue> {
    private final double convergenceThreshold;

    public ScoreConvergence(double convergenceThreshold) {
        this.convergenceThreshold = convergenceThreshold;
    }

    @Override
    public boolean isConverged(int iteration, DoubleValue value) {
        double change = value.getValue();

        // a negative aggregate is the first-superstep sentinel: keep iterating
        return change >= 0 && change <= convergenceThreshold;
    }
}
/**
 * Copies the final (ID, hub, authority) tuple into the public {@link Result}
 * type, reusing a single output instance.
 *
 * @param <T> ID type
 */
@ForwardedFields("0; 1; 2")
private static class TranslateResult<T>
implements MapFunction<Tuple3<T, DoubleValue, DoubleValue>, Result<T>> {
    private Result<T> result = new Result<>();

    @Override
    public Result<T> map(Tuple3<T, DoubleValue, DoubleValue> value) throws Exception {
        result.setFields(value.f0, value.f1, value.f2);
        return result;
    }
}
/**
 * Wraps the {@link Tuple3} to encapsulate results from the HITS algorithm:
 * (vertex ID, hub score, authority score).
 *
 * @param <T> ID type
 */
public static class Result<T>
extends Tuple3<T, DoubleValue, DoubleValue>
implements PrintableResult, UnaryResult<T> {
    /** Seed for the Murmur3 hash of result fields. */
    public static final int HASH_SEED = 0xc7e39a63;

    // reused hasher; reset() on each hashCode() call
    private Murmur3_32 hasher = new Murmur3_32(HASH_SEED);

    @Override
    public T getVertexId0() {
        return f0;
    }

    @Override
    public void setVertexId0(T value) {
        f0 = value;
    }

    /**
     * Get the hub score. Good hubs link to good authorities.
     *
     * @return the hub score
     */
    public DoubleValue getHubScore() {
        return f1;
    }

    /**
     * Get the authority score. Good authorities are linked from good hubs.
     *
     * @return the authority score
     */
    public DoubleValue getAuthorityScore() {
        return f2;
    }

    @Override
    public String toPrintableString() {
        return "Vertex ID: " + getVertexId0()
            + ", hub score: " + getHubScore()
            + ", authority score: " + getAuthorityScore();
    }

    @Override
    public int hashCode() {
        return hasher.reset()
            .hash(f0.hashCode())
            .hash(f1.getValue())
            .hash(f2.getValue())
            .hash();
    }
}
}