/*
* File: PersonalizedPageRank.java
* Authors: Jeremy D. Wendt
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright 2016, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government.
* Export of this program may require a license from the United States
* Government. See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.graph.community;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.graph.DirectedNodeEdgeGraph;
import gov.sandia.cognition.util.DefaultKeyValuePair;
import gov.sandia.cognition.collection.DoubleArrayList;
import gov.sandia.cognition.collection.IntArrayList;
import gov.sandia.cognition.util.Pair;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Random;
import java.util.Set;
/**
* This class can compute PersonalizedPageRank for the input graph and a
* specified node, and also can determine a community for any specified node.
* The code permits multiple queries against the same graph, storing the
* necessary speed-up objects as part of the instance.
*
* @author jdwendt
* @param <NodeNameType> The graph's node name type
*/
@PublicationReference(type = PublicationType.WebPage, title
= "Personalized PageRank code", author = "dgleich", year = 2016, url
= "https://gist.github.com/dgleich/6201856")
public class PersonalizedPageRank<NodeNameType>
{
/**
* Yale-format-like representation of the neighbors of each node (see
* http://en.wikipedia.org/wiki/Sparse_matrix#Yale_format). This contains
* the ids of all neighbors of all nodes in node-order. To figure out a
* specific node's neighbors, look from index neighborsFirstIdx.get(i)
* (inclusive) to index neighborsFirstIdx.get(i+1) (exclusive).
*/
private final IntArrayList neighbors;
/**
* Yale-format-like representation of the neighbors of each node (see
* http://en.wikipedia.org/wiki/Sparse_matrix#Yale_format). This specifies
* the index of the first neighbor in the neighbors list.
*/
private final IntArrayList neighborsFirstIdx;
/**
* Yale-format-like representation of the neighbors of each node (see
* http://en.wikipedia.org/wiki/Sparse_matrix#Yale_format). This contains
* the weights of all neighbors of all nodes in node-order. Follows the same
* order as the neighbors list.
*/
private final DoubleArrayList neighborsWeights;
/**
* Stores the weighted degree of each node in the graph
*/
private final DoubleArrayList nodeWeightedDegree;
/**
* Stores a copy of the graph for some of the translation capabilities, etc.
*/
private final DirectedNodeEdgeGraph<NodeNameType> graph;
/**
* Stores the total weight of the edges in the graph times two (as each edge
* is counted for both directions)
*/
private final double gVol;
/**
* The random number generator for this instance
*/
private Random generator;
/**
* The tolerance for spreading PPR further
*/
private double pprTolerance;
/**
* Initializes all of the internal data-structures for the input graph. Note
* that if the input graph is altered after passing it to this class, the
* results for this instance can become unstable.
*
* @param graph The graph to compute personalized page rank for
*/
public PersonalizedPageRank(DirectedNodeEdgeGraph<NodeNameType> graph)
{
this(graph, 0.01);
}
/**
 * Creates an instance for the input graph: builds the Yale-format neighbor
 * lists, accumulates every node's weighted degree and the graph's total
 * volume, and creates an unseeded random number generator. Note that
 * altering the graph after handing it to this class can make the results
 * of this instance unstable.
 *
 * @param graph The graph to compute personalized page rank for
 * @param tolerance The tolerance for further spreading PPR. Should be
 * fairly small (0.01 or smaller). The closer to 0, the further the scores
 * spread. A value of 0 could lead to never quite converging (so probably
 * don't do that).
 */
public PersonalizedPageRank(DirectedNodeEdgeGraph<NodeNameType> graph,
    double tolerance)
{
    this.graph = graph;
    this.pprTolerance = tolerance;

    // NOTE(review): the boolean flag's meaning is defined by
    // YaleFormatWeightedNeighbors; presumably it requests that each edge be
    // listed for both of its endpoints -- confirm against that class.
    YaleFormatWeightedNeighbors<NodeNameType> yale
        = new YaleFormatWeightedNeighbors<>(graph, true);
    this.neighbors = yale.getNeighbors();
    this.neighborsFirstIdx = yale.getNeighborsFirstIndex();
    this.neighborsWeights = yale.getNeighborsWeights();

    final int numNodes = graph.getNumNodes();
    DoubleArrayList degrees = new DoubleArrayList(numNodes);
    double volume = 0;
    for (int node = 0; node < numNodes; ++node)
    {
        // Sum the weights of this node's slice of the neighbor lists
        double degree = 0.0;
        final int begin = this.neighborsFirstIdx.get(node);
        final int end = this.neighborsFirstIdx.get(node + 1);
        for (int e = begin; e < end; ++e)
        {
            final double weight = this.neighborsWeights.get(e);
            degree += weight;
            volume += weight;
        }
        degrees.add(degree);
    }
    this.nodeWeightedDegree = degrees;
    this.gVol = volume;

    this.generator = new Random();
}
/**
 * Replaces the tolerance used by subsequent score computations. The value
 * should be small, in (0, 0.01]. The closer to 0, the further the scores
 * spread. A value of 0 could lead to never quite converging (so probably
 * don't do that). The value is stored as given; no range check is made.
 *
 * @param tolerance The tolerance for further spreading PPR.
 */
public void setTolerance(double tolerance)
{
    this.pprTolerance = tolerance;
}
/**
 * Replaces this instance's random number generator with a new one created
 * from the input seed, so that later randomized runs draw from that
 * generator.
 *
 * @param seed The seed for the random number generator
 */
public void setRandomSet(long seed)
{
    Random seeded = new Random(seed);
    this.generator = seeded;
}
/**
 * Computes the scores of all nodes in the graph for a single seed node
 * given by name. The name is translated to its id by the graph and the
 * work is delegated to {@link #getScoresForAllNodesById(int)}.
 *
 * @param node The node to use as seed
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodes(NodeNameType node)
{
    int seedId = graph.getNodeId(node);
    return getScoresForAllNodesById(seedId);
}
/**
 * Computes the scores of all nodes in the graph for a single seed node
 * given by index. The nodes are processed in their stored order (not
 * randomized).
 *
 * @param nodeIdx The node index to use as seed
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesById(int nodeIdx)
{
    List<Integer> seed = Collections.singletonList(nodeIdx);
    return getScoresForAllNodesByIds(seed, false);
}
/**
 * Computes the scores of all nodes in the graph for a set of seed nodes
 * given by name. The names are translated to ids and the nodes are
 * processed in their stored order (not randomized).
 *
 * @param nodes The nodes to use as seed
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodes(List<NodeNameType> nodes)
{
    List<Integer> seedIds = convertToIds(nodes);
    return getScoresForAllNodesByIds(seedIds, false);
}
/**
 * Computes the scores of all nodes in the graph for a single seed node
 * given by name, optionally visiting neighbors in random order.
 *
 * @param node The node to use as seed
 * @param randomized If true, the order nodes are treated within the
 * algorithm will be randomized so that you can get an average result across
 * multiple runs or just realize that any single run is not the completely
 * true answer.
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodes(NodeNameType node,
    boolean randomized)
{
    int seedId = graph.getNodeId(node);
    return getScoresForAllNodesById(seedId, randomized);
}
/**
 * Computes the scores of all nodes in the graph for a single seed node
 * given by index, optionally visiting neighbors in random order.
 *
 * @param nodeIdx The node index to use as seed
 * @param randomized If true, the order nodes are treated within the
 * algorithm will be randomized so that you can get an average result across
 * multiple runs or just realize that any single run is not the completely
 * true answer.
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesById(int nodeIdx,
    boolean randomized)
{
    List<Integer> seed = Collections.singletonList(nodeIdx);
    return getScoresForAllNodesByIds(seed, randomized);
}
/**
 * Computes the scores of all nodes in the graph for a set of seed nodes
 * given by name, optionally visiting neighbors in random order.
 *
 * @param nodes The nodes to use as seed
 * @param randomized If true, the order nodes are treated within the
 * algorithm will be randomized so that you can get an average result across
 * multiple runs or just realize that any single run is not the completely
 * true answer.
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodes(List<NodeNameType> nodes,
    boolean randomized)
{
    List<Integer> seedIds = convertToIds(nodes);
    return getScoresForAllNodesByIds(seedIds, randomized);
}
/**
 * Computes the scores of all nodes in the graph for a set of seed nodes
 * given by index. The nodes are processed in their stored order (not
 * randomized).
 *
 * @param nodeIdxs The node indices to use as seed
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesByIds(List<Integer> nodeIdxs)
{
    final boolean randomized = false;
    return getScoresForAllNodesByIds(nodeIdxs, randomized);
}
/**
 * Computes the scores of all nodes in the graph for the input seed
 * indices. The seeds split an initial residual of 1 evenly and are placed
 * on a queue. Each node taken from the queue moves a (1 - 0.99) fraction
 * of its residual into its score, spreads residual to its neighbors in
 * proportion to the edge weights, and keeps the remainder. A node is
 * (re-)queued whenever its residual reaches its weighted degree times the
 * tolerance; the computation ends when the queue is empty.
 *
 * @param nodeIdxs The node indices to use as seed
 * @param randomized If true, the order nodes are treated within the
 * algorithm will be randomized so that you can get an average result across
 * multiple runs or just realize that any single run is not the completely
 * true answer.
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesByIds(List<Integer> nodeIdxs,
    boolean randomized)
{
    final double alpha = 0.99;
    final double tol = pprTolerance;
    final int numNodes = graph.getNumNodes();

    DoubleArrayList residual = DoubleArrayList.zeros(numNodes);
    DoubleArrayList scores = DoubleArrayList.zeros(numNodes);
    Queue<Integer> pending = new LinkedList<>();

    // Every seed starts with an equal share of the total residual
    final double seedResidual = 1.0 / nodeIdxs.size();
    for (int seed : nodeIdxs)
    {
        residual.set(seed, seedResidual);
        pending.add(seed);
    }

    while (!pending.isEmpty())
    {
        final int v = pending.remove();
        final double residualV = residual.get(v);
        final double degreeV = nodeWeightedDegree.get(v);

        scores.plusEquals(v, (1 - alpha) * residualV);
        // Residual handed out per unit of edge weight
        final double mass = alpha * residualV / (2 * degreeV);

        IntArrayList edges = IntArrayList.range(neighborsFirstIdx.get(v),
            neighborsFirstIdx.get(v + 1));
        if (randomized)
        {
            edges.randomizeOrder(generator);
        }

        final int numEdges = edges.size();
        for (int k = 0; k < numEdges; ++k)
        {
            final int edge = edges.get(k);
            final int u = neighbors.get(edge);
            if (u == v)
            {
                throw new RuntimeException(
                    "This line should be unreachable.");
            }

            final double pushed = mass * neighborsWeights.get(edge);
            final double thresholdU = nodeWeightedDegree.get(u) * tol;
            final double residualU = residual.get(u);
            // Queue u only when this push lifts it across the threshold;
            // a node already at or above it is already in the queue
            final boolean wasBelow = residualU < thresholdU;
            if (wasBelow && (residualU + pushed >= thresholdU))
            {
                pending.add(u);
            }
            residual.plusEquals(u, pushed);
        }

        // v keeps the part of its residual that was not handed out
        residual.set(v, mass * degreeV);
        if (residual.get(v) >= degreeV * tol)
        {
            pending.add(v);
        }
    }

    return scores;
}
/**
 * Computes the scores of all nodes in the graph for a single seed node
 * given by name, using the randomized version and averaging over numRuns
 * runs.
 *
 * @param node The seed node to consider
 * @param numRuns The number of runs to perform
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesMultirun(NodeNameType node,
    int numRuns)
{
    int seedId = graph.getNodeId(node);
    return getScoresForAllNodesByIdMultirun(seedId, numRuns);
}
/**
 * Computes the scores of all nodes in the graph for a set of seed nodes
 * given by name, using the randomized version and averaging over numRuns
 * runs.
 *
 * @param nodes The seed nodes
 * @param numRuns The number of runs to perform
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesMultirun(List<NodeNameType> nodes,
    int numRuns)
{
    List<Integer> seedIds = convertToIds(nodes);
    return getScoresForAllNodesByIdMultirun(seedIds, numRuns);
}
/**
 * Computes the scores of all nodes in the graph for a single seed node
 * given by index, using the randomized version and averaging over numRuns
 * runs.
 *
 * @param nodeIdx The seed node's id
 * @param numRuns The number of runs to perform
 * @return the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph)
 */
public DoubleArrayList getScoresForAllNodesByIdMultirun(int nodeIdx,
    int numRuns)
{
    List<Integer> seed = Collections.singletonList(nodeIdx);
    return getScoresForAllNodesByIdMultirun(seed, numRuns);
}
/**
 * Returns the vector of all scores for all nodes in the graph (order
 * determined by node order as stored in the graph) for the input seed
 * indices. This implementation performs numRuns randomized runs and
 * returns the element-wise average of their score vectors.
 *
 * @param nodeIdxs The seed nodes' ids
 * @param numRuns The number of runs to perform; must be positive
 * @return the vector of all scores for all nodes in the graph
 * @throws IllegalArgumentException if numRuns is not positive (averaging
 * over zero runs would otherwise fill the result with NaN)
 */
public DoubleArrayList getScoresForAllNodesByIdMultirun(List<Integer> nodeIdxs,
    int numRuns)
{
    if (numRuns <= 0)
    {
        throw new IllegalArgumentException(
            "numRuns must be positive, but was " + numRuns);
    }

    // Accumulate the scores of every randomized run
    DoubleArrayList x = DoubleArrayList.zeros(graph.getNumNodes());
    for (int i = 0; i < numRuns; ++i)
    {
        DoubleArrayList tmp = getScoresForAllNodesByIds(nodeIdxs, true);
        for (int j = 0; j < tmp.size(); ++j)
        {
            x.plusEquals(j, tmp.get(j));
        }
    }

    // Turn the sums into averages
    double scalar = 1.0 / numRuns;
    for (int j = 0; j < x.size(); ++j)
    {
        x.set(j, x.get(j) * scalar);
    }

    return x;
}
/**
 * Computes the best community for the input node by personalized page
 * rank scores and conductance of cut. The node name is translated to its
 * id and the work is delegated to
 * {@link #getCommunityForNodeById(int, int, int)}. Note both the PPR
 * computation and the community-via-conductance computation contain
 * order-dependence, so random orderings are always used: the PPR scores
 * are averaged across the PPR runs, and the community with the best
 * conductance found across the cut runs is returned.
 *
 * @param node The node whose community is desired
 * @param numRunsPpr The number of randomized runs to perform for the PPR
 * computation
 * @param numRunsCut The number of randomized runs to perform for community
 * determination
 * @return the ids of the nodes in the best community found; this is
 * whatever the delegate returns, which can be null when no candidate
 * community was accepted
 */
public Set<Integer> getCommunityForNode(NodeNameType node,
    int numRunsPpr,
    int numRunsCut)
{
    int seedId = graph.getNodeId(node);
    return getCommunityForNodeById(seedId, numRunsPpr, numRunsCut);
}
/**
 * Computes the best community for the input node id by personalized page
 * rank scores and conductance of cut. The id is wrapped in a one-element
 * list and the work is delegated to
 * {@link #getCommunityForNodesById(List, int, int)}. Note both the PPR
 * computation and the community-via-conductance computation contain
 * order-dependence, so random orderings are always used: the PPR scores
 * are averaged across the PPR runs, and the community with the best
 * conductance found across the cut runs is returned.
 *
 * @param nodeIdx The node id whose community is desired
 * @param numRunsPpr The number of randomized runs to perform for the PPR
 * computation
 * @param numRunsCut The number of randomized runs to perform for community
 * determination
 * @return the ids of the nodes in the best community found; this is
 * whatever the delegate returns, which can be null when no candidate
 * community was accepted
 */
public Set<Integer> getCommunityForNodeById(int nodeIdx,
    int numRunsPpr,
    int numRunsCut)
{
    List<Integer> seed = Collections.singletonList(nodeIdx);
    return getCommunityForNodesById(seed, numRunsPpr, numRunsCut);
}
/**
 * Computes the best community for the input node(s) by personalized page
 * rank scores and conductance of cut. The node names are translated to
 * ids and the work is delegated to
 * {@link #getCommunityForNodesById(List, int, int)}. Note both the PPR
 * computation and the community-via-conductance computation contain
 * order-dependence, so random orderings are always used: the PPR scores
 * are averaged across the PPR runs, and the community with the best
 * conductance found across the cut runs is returned.
 *
 * @param nodes The node(s) whose community is desired
 * @param numRunsPpr The number of randomized runs to perform for the PPR
 * computation
 * @param numRunsCut The number of randomized runs to perform for community
 * determination
 * @return the ids of the nodes in the best community found; this is
 * whatever the delegate returns, which can be null when no candidate
 * community was accepted
 */
public Set<Integer> getCommunityForNodes(List<NodeNameType> nodes,
    int numRunsPpr,
    int numRunsCut)
{
    List<Integer> seedIds = convertToIds(nodes);
    return getCommunityForNodesById(seedIds, numRunsPpr, numRunsCut);
}
/**
 * Private helper that looks up the graph's id for every node name in the
 * input list, keeping the input order.
 *
 * @param nodes The node names to convert
 * @return a new list holding the node ids, one per input name
 */
private List<Integer> convertToIds(List<NodeNameType> nodes)
{
    final int count = nodes.size();
    List<Integer> ids = new ArrayList<>(count);
    for (NodeNameType name : nodes)
    {
        ids.add(graph.getNodeId(name));
    }

    return ids;
}
/**
 * Computes the best community for the input node id(s) by personalized
 * page rank scores and conductance of cut. The averaged PPR scores of the
 * visited nodes are divided by the nodes' weighted degrees and sorted in
 * decreasing order. The nodes are then added one at a time in that order,
 * and the prefix with the lowest conductance (cut weight divided by the
 * smaller of the prefix's volume and the remaining volume) is kept.
 * Note both the PPR computation and the community-via-conductance
 * computation contain order-dependence. Therefore, this code always uses a
 * random ordering. For PPR, it uses the average PPR across all of the
 * runs. For the community determination, it returns the community with the
 * best conductance found across numRunsCut passes, each of which visits
 * every node's edges in a freshly randomized order.
 *
 * @param nodeIdxs The node id(s) whose community is desired
 * @param numRunsPpr The number of randomized runs to perform for the PPR
 * computation
 * @param numRunsCut The number of randomized runs to perform for community
 * determination
 * @return the ids of the nodes in the best community found, or null if no
 * candidate was accepted (for example when numRunsCut is not positive or
 * every candidate had a practically zero volume)
 */
public Set<Integer> getCommunityForNodesById(List<Integer> nodeIdxs,
    int numRunsPpr,
    int numRunsCut)
{
    DoubleArrayList x = getScoresForAllNodesByIdMultirun(nodeIdxs, numRunsPpr);

    List<Pair<Integer, Double>> sorted = new ArrayList<>(x.size());
    for (int i = 0; i < x.size(); ++i)
    {
        double score = x.get(i);
        // Don't add unvisited nodes. The raw score is tested before the
        // division on purpose: an unvisited node with a weighted degree of
        // 0 would otherwise become 0/0 = NaN, slip past this check, and
        // sort ahead of every real candidate.
        if (score == 0)
        {
            continue;
        }
        double normalized = score / nodeWeightedDegree.get(i);
        sorted.add(new DefaultKeyValuePair<>(i, normalized));
    }

    // Highest degree-normalized score first
    Collections.sort(sorted, new Comparator<Pair<Integer, Double>>()
    {
        @Override
        public int compare(Pair<Integer, Double> o1,
            Pair<Integer, Double> o2)
        {
            return Double.compare(o2.getSecond(), o1.getSecond());
        }
    });

    double bestCond = Double.MAX_VALUE;
    Set<Integer> bestSet = null;
    for (int r = 0; r < numRunsCut; ++r)
    {
        double volS = 0.0;
        double cutS = 0.0;
        Set<Integer> setToAddTo = new HashSet<>(x.size());
        for (int i = 0; i < sorted.size(); ++i)
        {
            int idx = sorted.get(i).getFirst();
            volS += nodeWeightedDegree.get(idx);
            IntArrayList loop = IntArrayList.range(neighborsFirstIdx.get(idx),
                neighborsFirstIdx.get(idx + 1));
            // The shuffle only changes the order in which this node's edge
            // weights are accumulated into the cut
            loop.randomizeOrder(generator);
            for (int s = 0; s < loop.size(); ++s)
            {
                int j = loop.get(s);
                if (setToAddTo.contains(neighbors.get(j)))
                {
                    // Edge to a node already in the set: no longer cut
                    cutS -= neighborsWeights.get(j);
                }
                else
                {
                    // Edge to a node outside the set: newly cut
                    cutS += neighborsWeights.get(j);
                }
            }
            setToAddTo.add(idx);

            double denom = Math.min(volS, gVol - volS);
            double cond = cutS / denom;
            // It can be practically 0, and that leads to ill-defined results
            if (Math.abs(denom) < 1e-10)
            {
                cond = Double.MAX_VALUE;
            }
            if (cond < bestCond)
            {
                bestCond = cond;
                // Make a copy of this set
                bestSet = new HashSet<>(setToAddTo);
            }
        }
    }

    return bestSet;
}
}