package edu.cmu.graphchi.apps.recommendations; import edu.cmu.graphchi.ChiFilenames; import edu.cmu.graphchi.ChiLogger; import edu.cmu.graphchi.preprocessing.VertexIdTranslate; import edu.cmu.graphchi.queries.VertexQuery; import java.io.*; import java.util.*; import java.util.logging.Logger; /** * Emulates Twitter's Who-To-Follow (WTF) algorithm's SALSA part as described in WWW'13 paper * WTF: The Who to Follow Service at Twitter: http://www.stanford.edu/~rezab/papers/wtf_overview.pdf * * This demonstration loads the followers * of the "circle of trust" (top visited vertices in egocentric random walk) directly * from the shards using the edu.cmu.graphchi.queries.VertexQuery class. Then SALSA * algorithm is run so that the "circle of trust" is on the left as "hubs" and their followers * on the right as "authorities". * * Circle of trust must be given externally to this class. <b>Note:</b> work in progress. * @author Aapo Kyrola */ // TODO: Make multithreaded / thread-safe. public class CircleOfTrustSalsa { private static final Logger logger = ChiLogger.getLogger("circle-of-trust"); static class SalsaVertex { int id; int degree = 0; SalsaVertex(int id) { this.id = id; } double value = 1.0; ArrayList<Integer> neighbors; } private VertexQuery queryService; // Neighbor list private HashMap<Integer, SalsaVertex> hubs; private HashMap<Integer, SalsaVertex> authorities; // Static cache static private Map<Integer, ArrayList<Integer>> cache; private String graphName; private static final int FILTER_LIMIT = 4; // NOTE: non-thread safe public CircleOfTrustSalsa(VertexQuery queryService, final int cacheSize) throws Exception { this.queryService = queryService; synchronized (CircleOfTrustSalsa.class) { if (cache == null) { cache = Collections.synchronizedMap(new LinkedHashMap<Integer, ArrayList<Integer>>(cacheSize, 1.0f, true) // LRU { @Override protected boolean removeEldestEntry(Map.Entry<Integer, ArrayList<Integer>> integerArrayListEntry) { return this.size() > cacheSize; } }); } } } public void initializeGraph(Collection<Integer> circleOfTrust) { hubs = new HashMap<Integer, SalsaVertex>(circleOfTrust.size(), 1.0f); long t = System.currentTimeMillis(); int totalNeighbors = 0; int cacheHits = 0; HashSet<Integer> querySet = new HashSet<Integer>(circleOfTrust.size()); for(int v : circleOfTrust) { hubs.put(v, new SalsaVertex(v)); if (cache.containsKey(v)) { SalsaVertex hub = hubs.get(v); hub.neighbors = cache.get(v); hub.degree = hub.neighbors.size(); cacheHits++; } else { querySet.add(v); } } /* Load neighbors of the circle of trust -- we probably would need some limitation here?? */ HashMap<Integer, ArrayList<Integer>> hubNeighbors = queryService.queryOutNeighbors(querySet); long queryTime = System.currentTimeMillis() - t; /* Initialize salsa */ t = System.currentTimeMillis(); for(Map.Entry<Integer, ArrayList<Integer>> entry: hubNeighbors.entrySet()) { int hubId = entry.getKey(); SalsaVertex hub = hubs.get(hubId); hub.neighbors = entry.getValue(); hub.degree = entry.getValue().size(); cache.put(hubId, entry.getValue()); } // Count total neighbors for(SalsaVertex hub : hubs.values()) { totalNeighbors += hub.neighbors.size(); } long salsaInitTime0 = System.currentTimeMillis() - t; // We do not add neighbors to authorities -- we can push values to authorities // and pull to hubs. int[] authEntries = new int[totalNeighbors]; int j = 0; for(SalsaVertex hub : hubs.values()) { for(int authId : hub.neighbors) { authEntries[j++] = authId; } } assert(j == authEntries.length); // Create map efficiently Arrays.sort(authEntries); int lastId = -1; int count = 0; // int filtered = 0; ArrayList<SalsaVertex> tmpAuth = new ArrayList<SalsaVertex>(1 + authEntries.length / 100); for(int i=0; i < authEntries.length; i++) { int authId = authEntries[i]; if (lastId != authId) { if (lastId >= 0) { if (count > FILTER_LIMIT) { SalsaVertex auth = new SalsaVertex(lastId); auth.degree = count; tmpAuth.add(auth); } else { // filtered++; } count = 0; } lastId = authId; } count++; } authorities = new HashMap<Integer, SalsaVertex>(tmpAuth.size()); for(SalsaVertex auth : tmpAuth) { authorities.put(auth.id, auth); } // NOTE: remove neighbors! /* long salsaInitTime = System.currentTimeMillis() - t; logger.info("Query took: " + queryTime + " ms, circle of trust size=" + circleOfTrust.size() + ", cache size=" + cache.size() + ", hits=" + cacheHits); logger.info("Salsa init: " + salsaInitTime + " ms, first phase=" + salsaInitTime0 + " ms, hubs=" + hubs.size() + ", auths=" + authorities.size()); logger.info("Filtered: " + filtered); */ } /** * Compute SALSA on graph initialized in initializeGraph() method. * @param nIterations */ public void computeSALSA(int nIterations) { for(int iter=0; iter < nIterations; iter++) { // Hubs: sum of authority-neighbors values divided by their degree for(SalsaVertex hub: hubs.values()) { double nbSum = 0.0; // Update the degree because not all authorities were selected int degree = 0; for(int authId : hub.neighbors) { SalsaVertex auth = authorities.get(authId); if (auth != null) { nbSum += auth.value / auth.degree; degree++; } } hub.value = nbSum; hub.degree = degree; } // Authorities: push from authority side. // First: set values to zero for(SalsaVertex auth: authorities.values()) { auth.value = 0; } // Then, push hubs values to their auths for(SalsaVertex hub: hubs.values()) { double myContribution = hub.value / hub.degree; for(int authId : hub.neighbors) { SalsaVertex auth = authorities.get(authId); if (auth != null) { auth.value += myContribution; } } } } } public VertexQuery getQueryService() { return queryService; } /** * Return top K authorities (result from SALSA), but do not include users in the removeList * @param K * @param removeList * @return */ public ArrayList<SalsaVertex> topAuthorities(int K, HashSet<Integer> removeList) { // TODO: faster top-K implementation ArrayList<SalsaVertex> all = new ArrayList<SalsaVertex>(authorities.size()); all.addAll(authorities.values()); Collections.sort(all, new Comparator<SalsaVertex>() { @Override public int compare(SalsaVertex salsaVertex, SalsaVertex salsaVertex1) { if (salsaVertex.value < salsaVertex1.value) return 1; else return (salsaVertex.value > salsaVertex1.value ? -1 : 0); } }); ArrayList<SalsaVertex> result = new ArrayList<SalsaVertex>(K); int i = 0; while(result.size() < K) { if (i < all.size()) { SalsaVertex x = all.get(i); if (!removeList.contains(x.id)) result.add(x); } else { break; } i++; } return result; } public String namify(Integer value) throws IOException { File f = new File(graphName + "_names.dat"); if (!f.exists()) { System.out.println("didn't find name file: " + f.getPath()); return value+""; } int i = value * 16; RandomAccessFile raf = new RandomAccessFile(f.getAbsolutePath(), "r"); raf.seek(i); byte[] tmp = new byte[16]; raf.read(tmp); raf.close(); return new String(tmp) + "(" + value + ")"; } public static void main(String[] args) throws Exception { String graphName = args[0]; int nShards = Integer.parseInt(args[1]); CircleOfTrustSalsa csalsa = new CircleOfTrustSalsa(new VertexQuery(graphName, nShards), 10000); VertexIdTranslate vertexTrans = VertexIdTranslate.fromFile(new File(ChiFilenames.getVertexTranslateDefFile(graphName, nShards))); BufferedReader cmdIn = new BufferedReader(new InputStreamReader(System.in)); while(true) { System.out.print("Enter vertex id to query >> :: "); String ln = cmdIn.readLine(); int vertex = Integer.parseInt(ln); // Circle of trust is just the vertex's followers for now HashSet<Integer> circle = csalsa.queryService.queryOutNeighbors(vertexTrans.forward(vertex)); int maxCircleSize = 300; // max 500 if (circle.size() > maxCircleSize) { int[] all = new int[circle.size()]; int i = 0; for(Integer v : circle) all[i++] = v; HashSet<Integer> filteredCircle = new HashSet<Integer>(); Random r = new Random(260379); for(i=0; i < maxCircleSize; i++) filteredCircle.add(all[Math.abs(r.nextInt()) % all.length]); circle = filteredCircle; } csalsa.initializeGraph(circle); long t = System.currentTimeMillis(); csalsa.computeSALSA(3); logger.info("SALSA computation took " + (System.currentTimeMillis() - t) + "ms"); circle.add(vertexTrans.forward(vertex)); ArrayList<SalsaVertex> top = csalsa.topAuthorities(20, circle); int j = 1; for(SalsaVertex sv : top) { int originalId = vertexTrans.backward(sv.id); logger.info("Top " + (j++) + " = " + originalId + " " + csalsa.namify(originalId) + " (" + sv.value + ")"); } } } }