/*
 * File:            PersonalizedPageRank.java
 * Authors:         Jeremy D. Wendt
 * Company:         Sandia National Laboratories
 * Project:         Cognitive Foundry
 *
 * Copyright 2016, Sandia Corporation. Under the terms of Contract
 * DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
 * or on behalf of the U.S. Government. Export of this program may require a
 * license from the United States Government. See CopyrightHistory.txt for
 * complete details.
 */

package gov.sandia.cognition.graph.community;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.graph.DirectedNodeEdgeGraph;
import gov.sandia.cognition.util.DefaultKeyValuePair;
import gov.sandia.cognition.collection.DoubleArrayList;
import gov.sandia.cognition.collection.IntArrayList;
import gov.sandia.cognition.util.Pair;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Random;
import java.util.Set;

/**
 * This class can compute personalized PageRank for the input graph and a
 * specified node, and can also determine a community for any specified node.
 * The code permits multiple queries against the same graph, storing the
 * necessary speed-up objects as part of the instance.
 *
 * @author jdwendt
 * @param <NodeNameType> The graph's node name type
 */
@PublicationReference(type = PublicationType.WebPage,
    title = "Personalized PageRank code",
    author = "dgleich",
    year = 2016,
    url = "https://gist.github.com/dgleich/6201856")
public class PersonalizedPageRank<NodeNameType>
{

    /**
     * Yale-format-like representation of the neighbors of each node (see
     * http://en.wikipedia.org/wiki/Sparse_matrix#Yale_format). This contains
     * the ids of all neighbors of all nodes in node-order. To figure out a
     * specific node's neighbors, look from indices neighborsFirstIdx.get(i) to
     * neighborsFirstIdx.get(i + 1).
     */
    private final IntArrayList neighbors;

    /**
     * Yale-format-like representation of the neighbors of each node (see
     * http://en.wikipedia.org/wiki/Sparse_matrix#Yale_format). This specifies
     * the index of the first neighbor of each node in the neighbors list.
     */
    private final IntArrayList neighborsFirstIdx;

    /**
     * Yale-format-like representation of the neighbors of each node (see
     * http://en.wikipedia.org/wiki/Sparse_matrix#Yale_format). This contains
     * the weights of all neighbors of all nodes in node-order. Follows the
     * same order as the neighbors list.
     */
    private final DoubleArrayList neighborsWeights;

    /**
     * Stores the weighted degree of each node in the graph
     */
    private final DoubleArrayList nodeWeightedDegree;

    /**
     * Stores a reference to the graph for some of the translation
     * capabilities, etc.
     */
    private final DirectedNodeEdgeGraph<NodeNameType> graph;

    /**
     * Stores the total weight of the edges in the graph times two (as each
     * edge is counted for both directions)
     */
    private final double gVol;

    /**
     * The random number generator for this instance
     */
    private Random generator;

    /**
     * The tolerance for spreading PPR further
     */
    private double pprTolerance;
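
    /*
     * Illustration (added commentary, not from the original source): a sketch
     * of how the Yale-format-like arrays above could encode a small graph,
     * assuming an unweighted triangle on nodes with ids 0, 1, 2 where each
     * edge is stored in both directions with weight 1:
     *
     *   neighborsFirstIdx = [0, 2, 4, 6]
     *   neighbors         = [1, 2,  0, 2,  0, 1]
     *   neighborsWeights  = [1, 1,  1, 1,  1, 1]
     *
     * Node 1's neighbors then sit at indices neighborsFirstIdx.get(1) = 2
     * (inclusive) through neighborsFirstIdx.get(2) = 4 (exclusive), i.e.,
     * nodes 0 and 2, and nodeWeightedDegree.get(1) would be 2.0.
     */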

    /**
     * Initializes all of the internal data-structures for the input graph.
     * Note that if the input graph is altered after passing it to this class,
     * the results for this instance can become unstable.
     *
     * @param graph The graph to compute personalized PageRank for
     */
    public PersonalizedPageRank(DirectedNodeEdgeGraph<NodeNameType> graph)
    {
        this(graph, 0.01);
    }

    /**
     * Initializes all of the internal data-structures for the input graph.
     * Note that if the input graph is altered after passing it to this class,
     * the results for this instance can become unstable.
     *
     * @param graph The graph to compute personalized PageRank for
     * @param tolerance The tolerance for further spreading PPR. Should be
     * fairly small, 0.01 or smaller. The closer to 0, the further it will
     * spread. Setting it to 0 could lead to never quite converging (so
     * probably don't do that).
     */
    public PersonalizedPageRank(DirectedNodeEdgeGraph<NodeNameType> graph,
        double tolerance)
    {
        this.pprTolerance = tolerance;
        YaleFormatWeightedNeighbors<NodeNameType> neigh =
            new YaleFormatWeightedNeighbors<>(graph, true);
        this.neighbors = neigh.getNeighbors();
        this.neighborsFirstIdx = neigh.getNeighborsFirstIndex();
        this.neighborsWeights = neigh.getNeighborsWeights();
        this.graph = graph;
        this.nodeWeightedDegree = new DoubleArrayList(graph.getNumNodes());
        double tmpgVol = 0;
        for (int i = 0; i < graph.getNumNodes(); ++i)
        {
            this.nodeWeightedDegree.add(0.0);
            for (int j = this.neighborsFirstIdx.get(i);
                j < this.neighborsFirstIdx.get(i + 1); ++j)
            {
                this.nodeWeightedDegree.plusEquals(i,
                    this.neighborsWeights.get(j));
                tmpgVol += this.neighborsWeights.get(j);
            }
        }
        this.gVol = tmpgVol;
        this.generator = new Random();
    }

    /**
     * Sets the tolerance to a new value. Should be small, in (0, 0.01]. The
     * closer to 0, the further it will spread. Setting it to 0 could lead to
     * never quite converging (so probably don't do that).
     *
     * @param tolerance The tolerance for further spreading PPR.
     */
    public void setTolerance(double tolerance)
    {
        pprTolerance = tolerance;
    }

    /**
     * Initializes the random number generator with the input seed.
     *
     * @param seed The seed for the random number generator
     */
    public void setRandomSet(long seed)
    {
        generator = new Random(seed);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph)
     *
     * @param node The node to use as seed
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodes(NodeNameType node)
    {
        return getScoresForAllNodesById(graph.getNodeId(node));
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph)
     *
     * @param nodeIdx The node index to use as seed
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesById(int nodeIdx)
    {
        return getScoresForAllNodesByIds(Collections.singletonList(nodeIdx),
            false);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seeds
     *
     * @param nodes The nodes to use as seed
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodes(List<NodeNameType> nodes)
    {
        return getScoresForAllNodesByIds(convertToIds(nodes), false);
    }
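
    /*
     * Usage sketch (added commentary, not from the original source): scoring
     * all nodes against a seed node. The graph instance, node name, and
     * tolerance below are hypothetical placeholders; only methods declared in
     * this class and graph.getNodeId(...) (used elsewhere in this file) are
     * assumed to exist.
     *
     *   DirectedNodeEdgeGraph<String> graph = ...; // built elsewhere
     *   PersonalizedPageRank<String> ppr =
     *       new PersonalizedPageRank<>(graph, 0.001);
     *   DoubleArrayList scores = ppr.getScoresForAllNodes("seedNode");
     *   // scores.get(i) is the approximate PPR score of the node with id i
     *   double seedScore = scores.get(graph.getNodeId("seedNode"));
     */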

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph)
     *
     * @param node The node to use as seed
     * @param randomized If true, the order in which nodes are treated within
     * the algorithm will be randomized so that you can get an average result
     * across multiple runs (or just realize that any single run is not the
     * completely true answer).
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodes(NodeNameType node,
        boolean randomized)
    {
        return getScoresForAllNodesById(graph.getNodeId(node), randomized);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph)
     *
     * @param nodeIdx The node index to use as seed
     * @param randomized If true, the order in which nodes are treated within
     * the algorithm will be randomized so that you can get an average result
     * across multiple runs (or just realize that any single run is not the
     * completely true answer).
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesById(int nodeIdx,
        boolean randomized)
    {
        return getScoresForAllNodesByIds(Collections.singletonList(nodeIdx),
            randomized);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seeds
     *
     * @param nodes The nodes to use as seed
     * @param randomized If true, the order in which nodes are treated within
     * the algorithm will be randomized so that you can get an average result
     * across multiple runs (or just realize that any single run is not the
     * completely true answer).
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodes(List<NodeNameType> nodes,
        boolean randomized)
    {
        return getScoresForAllNodesByIds(convertToIds(nodes), randomized);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seed indices
     *
     * @param nodeIdxs The node indices to use as seed
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesByIds(List<Integer> nodeIdxs)
    {
        return getScoresForAllNodesByIds(nodeIdxs, false);
    }
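
    /*
     * Note (added commentary): the method below implements a "push"-style
     * approximation of personalized PageRank, following the referenced
     * dgleich gist. Each queued node v moves (1 - ALPHA) of its residual into
     * its score x, pushes half of the remaining ALPHA * residual to its
     * neighbors in proportion to edge weight, and keeps the other half as its
     * new residual. A node u is (re)enqueued only when its residual reaches
     * nodeWeightedDegree.get(u) * TOL, so smaller tolerances spread the
     * computation further across the graph.
     */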

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seed indices
     *
     * @param nodeIdxs The node indices to use as seed
     * @param randomized If true, the order in which nodes are treated within
     * the algorithm will be randomized so that you can get an average result
     * across multiple runs (or just realize that any single run is not the
     * completely true answer).
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesByIds(List<Integer> nodeIdxs,
        boolean randomized)
    {
        final double ALPHA = 0.99;
        final double TOL = pprTolerance;
        Queue<Integer> fifo = new LinkedList<>();
        DoubleArrayList residual = new DoubleArrayList(graph.getNumNodes());
        DoubleArrayList x = new DoubleArrayList(graph.getNumNodes());
        for (int i = 0; i < graph.getNumNodes(); ++i)
        {
            residual.add(0.0);
            x.add(0.0);
        }
        double init = 1.0 / nodeIdxs.size();
        for (int n : nodeIdxs)
        {
            residual.set(n, init);
            fifo.add(n);
        }
        while (!fifo.isEmpty())
        {
            int v = fifo.remove();
            x.plusEquals(v, (1 - ALPHA) * residual.get(v));
            double mass = ALPHA * residual.get(v)
                / (2 * nodeWeightedDegree.get(v));
            IntArrayList range = IntArrayList.range(neighborsFirstIdx.get(v),
                neighborsFirstIdx.get(v + 1));
            if (randomized)
            {
                range.randomizeOrder(generator);
            }
            for (int m = 0; m < range.size(); ++m)
            {
                int i = range.get(m);
                int u = neighbors.get(i);
                if (u == v)
                {
                    throw new RuntimeException(
                        "This line should be unreachable.");
                }
                // The first part of the if ensures u is not already in the
                // queue
                if ((residual.get(u) < nodeWeightedDegree.get(u) * TOL)
                    && (residual.get(u) + mass * neighborsWeights.get(i)
                    >= nodeWeightedDegree.get(u) * TOL))
                {
                    fifo.add(u);
                }
                residual.plusEquals(u, mass * neighborsWeights.get(i));
            }
            residual.set(v, mass * nodeWeightedDegree.get(v));
            if (residual.get(v) >= nodeWeightedDegree.get(v) * TOL)
            {
                fifo.add(v);
            }
        }
        return x;
    }
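
    /*
     * Usage sketch (added commentary, not from the original source): because
     * the randomized variants depend on the order in which neighbors are
     * visited, one pattern is to seed the generator for reproducibility and
     * then average several runs via the multirun methods declared below. The
     * seed, node id, and run count are arbitrary placeholders.
     *
     *   ppr.setRandomSet(42L);
     *   DoubleArrayList averaged = ppr.getScoresForAllNodesByIdMultirun(0, 10);
     */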

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seed. This implementation uses the random version and averages
     * over numRuns random runs.
     *
     * @param node The seed node to consider
     * @param numRuns The number of runs to perform
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesMultirun(NodeNameType node,
        int numRuns)
    {
        return getScoresForAllNodesByIdMultirun(graph.getNodeId(node),
            numRuns);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seeds. This implementation uses the random version and averages
     * over numRuns random runs.
     *
     * @param nodes The seed nodes
     * @param numRuns The number of runs to perform
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesMultirun(
        List<NodeNameType> nodes, int numRuns)
    {
        return getScoresForAllNodesByIdMultirun(convertToIds(nodes), numRuns);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seed index. This implementation uses the random version and
     * averages over numRuns random runs.
     *
     * @param nodeIdx The seed node's id
     * @param numRuns The number of runs to perform
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesByIdMultirun(int nodeIdx,
        int numRuns)
    {
        return getScoresForAllNodesByIdMultirun(Collections.singletonList(
            nodeIdx), numRuns);
    }

    /**
     * Returns the vector of all scores for all nodes in the graph (order
     * determined by node order as stored in the graph) as specified for the
     * input seed indices. This implementation uses the random version and
     * averages over numRuns random runs.
     *
     * @param nodeIdxs The seed nodes' ids
     * @param numRuns The number of runs to perform
     * @return the vector of all scores for all nodes in the graph
     */
    public DoubleArrayList getScoresForAllNodesByIdMultirun(
        List<Integer> nodeIdxs, int numRuns)
    {
        DoubleArrayList x = DoubleArrayList.zeros(graph.getNumNodes());
        for (int i = 0; i < numRuns; ++i)
        {
            DoubleArrayList tmp = getScoresForAllNodesByIds(nodeIdxs, true);
            for (int j = 0; j < tmp.size(); ++j)
            {
                x.plusEquals(j, tmp.get(j));
            }
        }
        double scalar = 1.0 / numRuns;
        for (int j = 0; j < x.size(); ++j)
        {
            x.set(j, x.get(j) * scalar);
        }
        return x;
    }

    /**
     * Computes the best community for the input node by personalized PageRank
     * scores and conductance of cut. Note that both the PPR computation and
     * the community-via-conductance computation contain order-dependence.
     * Therefore, this code always uses a random ordering. For PPR, it returns
     * the average PPR across all of the runs. For the community
     * determination, it returns the community with the best conductance found
     * by following edges in different orders.
     *
     * @param node The node whose community is desired
     * @param numRunsPpr The number of randomized runs to perform for the PPR
     * computation
     * @param numRunsCut The number of randomized runs to perform for
     * community determination
     * @return the members (as node ids) of the best community found
     */
    public Set<Integer> getCommunityForNode(NodeNameType node, int numRunsPpr,
        int numRunsCut)
    {
        return getCommunityForNodeById(graph.getNodeId(node), numRunsPpr,
            numRunsCut);
    }

    /**
     * Computes the best community for the input node id by personalized
     * PageRank scores and conductance of cut. Note that both the PPR
     * computation and the community-via-conductance computation contain
     * order-dependence. Therefore, this code always uses a random ordering.
     * For PPR, it returns the average PPR across all of the runs. For the
     * community determination, it returns the community with the best
     * conductance found by following edges in different orders.
     *
     * @param nodeIdx The node id whose community is desired
     * @param numRunsPpr The number of randomized runs to perform for the PPR
     * computation
     * @param numRunsCut The number of randomized runs to perform for
     * community determination
     * @return the members (as node ids) of the best community found
     */
    public Set<Integer> getCommunityForNodeById(int nodeIdx, int numRunsPpr,
        int numRunsCut)
    {
        return getCommunityForNodesById(Collections.singletonList(nodeIdx),
            numRunsPpr, numRunsCut);
    }

    /**
     * Computes the best community for the input node(s) by personalized
     * PageRank scores and conductance of cut. Note that both the PPR
     * computation and the community-via-conductance computation contain
     * order-dependence. Therefore, this code always uses a random ordering.
     * For PPR, it returns the average PPR across all of the runs. For the
     * community determination, it returns the community with the best
     * conductance found by following edges in different orders.
     *
     * @param nodes The node(s) whose community is desired
     * @param numRunsPpr The number of randomized runs to perform for the PPR
     * computation
     * @param numRunsCut The number of randomized runs to perform for
     * community determination
     * @return the members (as node ids) of the best community found
     */
    public Set<Integer> getCommunityForNodes(List<NodeNameType> nodes,
        int numRunsPpr, int numRunsCut)
    {
        return getCommunityForNodesById(convertToIds(nodes), numRunsPpr,
            numRunsCut);
    }

    /**
     * Private helper that converts a list of node names to a list of node ids
     *
     * @param nodes The node names to convert
     * @return the node ids
     */
    private List<Integer> convertToIds(List<NodeNameType> nodes)
    {
        List<Integer> nodeIds = new ArrayList<>(nodes.size());
        for (NodeNameType node : nodes)
        {
            nodeIds.add(graph.getNodeId(node));
        }
        return nodeIds;
    }
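
    /*
     * Note (added commentary): the method below selects a community with a
     * sweep over nodes ordered by degree-normalized PPR score. As each node is
     * added to the candidate set S, it maintains vol(S) (the sum of weighted
     * degrees of nodes in S) and cut(S) (the total weight of edges leaving S),
     * and scores the prefix by its conductance
     *
     *   cond(S) = cut(S) / min(vol(S), gVol - vol(S))
     *
     * keeping the prefix with the smallest conductance seen across all
     * numRunsCut randomized sweeps.
     */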

    /**
     * Computes the best community for the input node id(s) by personalized
     * PageRank scores and conductance of cut. Note that both the PPR
     * computation and the community-via-conductance computation contain
     * order-dependence. Therefore, this code always uses a random ordering.
     * For PPR, it returns the average PPR across all of the runs. For the
     * community determination, it returns the community with the best
     * conductance found by following edges in different orders.
     *
     * @param nodeIdxs The node id(s) whose community is desired
     * @param numRunsPpr The number of randomized runs to perform for the PPR
     * computation
     * @param numRunsCut The number of randomized runs to perform for
     * community determination
     * @return the members (as node ids) of the best community found
     */
    public Set<Integer> getCommunityForNodesById(List<Integer> nodeIdxs,
        int numRunsPpr, int numRunsCut)
    {
        DoubleArrayList x = getScoresForAllNodesByIdMultirun(nodeIdxs,
            numRunsPpr);
        for (int i = 0; i < graph.getNumNodes(); ++i)
        {
            x.set(i, x.get(i) / nodeWeightedDegree.get(i));
        }
        List<Pair<Integer, Double>> sorted = new ArrayList<>(x.size());
        for (int i = 0; i < x.size(); ++i)
        {
            // Don't add unvisited nodes
            if (x.get(i) == 0)
            {
                continue;
            }
            sorted.add(new DefaultKeyValuePair<>(i, x.get(i)));
        }
        Collections.sort(sorted, new Comparator<Pair<Integer, Double>>()
        {
            @Override
            public int compare(Pair<Integer, Double> o1,
                Pair<Integer, Double> o2)
            {
                return Double.compare(o2.getSecond(), o1.getSecond());
            }
        });
        double bestCond = Double.MAX_VALUE;
        Set<Integer> bestSet = null;
        for (int r = 0; r < numRunsCut; ++r)
        {
            double volS = 0.0;
            double cutS = 0.0;
            Set<Integer> setToAddTo = new HashSet<>(x.size());
            for (int i = 0; i < sorted.size(); ++i)
            {
                int idx = sorted.get(i).getFirst();
                volS += nodeWeightedDegree.get(idx);
                IntArrayList loop = IntArrayList.range(
                    neighborsFirstIdx.get(idx),
                    neighborsFirstIdx.get(idx + 1));
                loop.randomizeOrder(generator);
                for (int s = 0; s < loop.size(); ++s)
                {
                    int j = loop.get(s);
                    if (setToAddTo.contains(neighbors.get(j)))
                    {
                        cutS -= neighborsWeights.get(j);
                    }
                    else
                    {
                        cutS += neighborsWeights.get(j);
                    }
                }
                setToAddTo.add(idx);
                double denom = Math.min(volS, gVol - volS);
                double cond = cutS / denom;
                // The denominator can be practically 0, and that leads to
                // ill-defined results
                if (Math.abs(denom) < 1e-10)
                {
                    cond = Double.MAX_VALUE;
                }
                if (cond < bestCond)
                {
                    bestCond = cond;
                    // Make a copy of this set
                    bestSet = new HashSet<>(setToAddTo);
                }
            }
        }
        return bestSet;
    }

}
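
/*
 * Usage sketch (added commentary, not from the original source): extracting a
 * community for a seed node. The graph instance, node name, seed, and run
 * counts are hypothetical placeholders; the returned set contains graph node
 * ids.
 *
 *   PersonalizedPageRank<String> ppr = new PersonalizedPageRank<>(graph);
 *   ppr.setRandomSet(1234L);
 *   Set<Integer> community = ppr.getCommunityForNode("seedNode", 5, 5);
 *   boolean containsSeed = community.contains(graph.getNodeId("seedNode"));
 */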