package edu.cmu.graphchi.walks.analysis;

import java.io.*;
import java.util.*;

/**
 * Template code for computing paths from the walk files produced
 * by DrunkardMobForPaths.
 *
 * Usage:
 *    [directory-containing the walks] [min-walk-id to process] [max-walk-id to process] [maxHops]
 *
 * The idea is to process the walks in chunks small enough to be handled in memory.
 * For example, if you did 1 billion walks of 10 hops, perhaps analyzing 200 million walks at a time
 * makes sense: first run with min-walk-id 0 and max-walk-id 200,000,000, then with
 * min-walk-id 200,000,001 etc... (on a machine with ~ 100 gig of memory).
 * CAUTION: Java's memory allocation is a bit hard to estimate, so you need to try out
 * good chunk sizes.
 *
 * Process:
 *   1. compute the path for each walk id
 *   2. group by walk source
 *   3. group the walks of each source by destination
 *   4. group by path type (the function returning the path type is just a mock)
 *   5. output the distribution: for source-destination, path-type, count
 *
 * @author Aapo Kyrola, akyrola@cs.cmu.edu
 */
public class WalkPathAnalyzerTemplate {

    /** Bytes per record in a walk file: int walk id + short hop + int vertex. */
    private static final int WALK_RECORD_BYTES = 10;

    /** Size of the read buffer used for each walk file. */
    private static final int READ_BUFFER_BYTES = 1024 * 1024 * 50;

    /** A progress line is printed every this many records. */
    private static final long PROGRESS_INTERVAL = 1000000;

    /** Selects the low 32 bits of a packed (hop, vertex) entry, i.e. the vertex id. */
    private static final long VERTEX_MASK = 0xffffffffL;

    private File directory;
    private BufferedWriter output;

    /**
     * Creates an analyzer over the given directory and opens the result file
     * "walkoutput" (relative to the working directory) for writing.
     *
     * @param directory directory that contains the walk files
     * @throws IllegalArgumentException if {@code directory} is not a directory
     * @throws IOException if the result file cannot be opened
     */
    public WalkPathAnalyzerTemplate(File directory) throws IOException {
        this.directory = directory;
        if (!this.directory.isDirectory()) {
            throw new IllegalArgumentException("You must provide a directory");
        }
        output = new BufferedWriter(new FileWriter("walkoutput"));
    }

    /**
     * Reads every file in the directory whose name starts with "walks_", rebuilds the
     * path of each walk whose id lies in {@code [minWalkId, maxWalkId]}, and writes the
     * path-type distribution per (source, destination) pair to the result file.
     *
     * Currently a very naive implementation. TODO: Make memory efficient and smarter in general.
     * Just for demonstration purposes.
     *
     * The result file is closed when this method finishes, whether it succeeds or
     * fails, so an analyzer instance supports a single call.
     *
     * @param minWalkId smallest walk id to process (inclusive)
     * @param maxWalkId largest walk id to process (inclusive)
     * @param maxHops   maximum number of records kept per walk; further records are dropped
     * @throws IllegalArgumentException if the id range is inverted or too large, or maxHops is negative
     * @throws IOException if the directory cannot be listed, or reading or writing fails
     */
    public void analyze(int minWalkId, int maxWalkId, int maxHops) throws IOException {
        // Computed in long arithmetic so that an extreme range cannot silently overflow.
        long walkCount = (long) maxWalkId - (long) minWalkId + 1;
        if (walkCount < 0 || walkCount > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "Invalid walk id range: " + minWalkId + " .. " + maxWalkId);
        }
        if (maxHops < 0) {
            throw new IllegalArgumentException("maxHops must be non-negative: " + maxHops);
        }

        try {
            Walk[] paths = new Walk[(int) walkCount];
            for (int i = 0; i < paths.length; i++) {
                paths[i] = new Walk(maxHops);
            }

            String[] walkFiles = directory.list(new FilenameFilter() {
                @Override
                public boolean accept(File file, String s) {
                    return s.startsWith("walks_");
                }
            });
            if (walkFiles == null) {
                // File.list returns null when the directory cannot be read.
                throw new IOException("Could not list directory: " + directory);
            }

            for (String walkFile : walkFiles) {
                System.out.println("Analyze: " + walkFile);
                readWalkFile(new File(directory, walkFile), minWalkId, maxWalkId, paths);
            }

            // Records may arrive in any order; put each walk's entries into hop order.
            for (Walk w : paths) {
                w.sort();
            }

            /* Analyze the walks */
            groupBySourceAndAnalyze(paths);
        } finally {
            output.close();
        }
    }

    /**
     * Reads one walk file and adds every record whose walk id is in range to the
     * matching entry of {@code paths}. The stream is always closed.
     */
    private void readWalkFile(File walkFile, int minWalkId, int maxWalkId, Walk[] paths)
            throws IOException {
        long walksInFile = walkFile.length() / WALK_RECORD_BYTES;
        DataInputStream dis = new DataInputStream(new BufferedInputStream(
                new FileInputStream(walkFile), READ_BUFFER_BYTES));
        try {
            for (long i = 0; i < walksInFile; i++) {
                if (i % PROGRESS_INTERVAL == 0) {
                    System.out.println(i + " / " + walksInFile);
                }
                int walkId = dis.readInt();
                short hop = dis.readShort();
                int atVertex = dis.readInt();
                if (walkId >= minWalkId && walkId <= maxWalkId) {
                    paths[walkId - minWalkId].addWalk(hop, atVertex);
                }
            }
        } catch (EOFException eof) {
            // The file ended before the expected number of records; keep what was
            // read and let the caller move on to the next file.
            System.err.println("Unexpected end of file: " + walkFile);
        } finally {
            dis.close();
        }
    }

    /**
     * Since all walk ids from a given source are in a consecutive
     * interval, we can easily select the walks from one source and
     * then analyze them based on destination. Walks for which no
     * record was read are skipped.
     *
     * @param paths walks indexed by walk id, each already sorted by hop
     */
    private void groupBySourceAndAnalyze(Walk[] paths) throws IOException {
        ArrayList<Walk> curSet = new ArrayList<Walk>();
        int curSource = 0; // only meaningful while curSet is non-empty
        for (Walk w : paths) {
            if (w.isEmpty()) {
                continue;
            }
            if (!curSet.isEmpty() && w.getSource() != curSource) {
                processWalksFromSource(curSource, curSet);
                curSet = new ArrayList<Walk>();
            }
            curSource = w.getSource();
            curSet.add(w);
        }
        // Last group
        if (!curSet.isEmpty()) {
            processWalksFromSource(curSource, curSet);
        }
    }

    /**
     * Sorts the walks of one source by destination and hands every run of walks
     * that share a destination, including the last one, to
     * {@link #handleSourcePathSet}.
     */
    private void processWalksFromSource(int source, ArrayList<Walk> walksFromSource)
            throws IOException {
        // Sort by target
        Collections.sort(walksFromSource, new Comparator<Walk>() {
            @Override
            public int compare(Walk walk1, Walk walk2) {
                int dest1 = walk1.getDestination();
                int dest2 = walk2.getDestination();
                return (dest1 == dest2 ? 0 : (dest1 < dest2 ? -1 : 1));
            }
        });

        // Group by target: each iteration consumes one run of equal destinations.
        int n = walksFromSource.size();
        int start = 0;
        while (start < n) {
            int dest = walksFromSource.get(start).getDestination();
            ArrayList<Walk> sameDest = new ArrayList<Walk>();
            int end = start;
            while (end < n && walksFromSource.get(end).getDestination() == dest) {
                sameDest.add(walksFromSource.get(end));
                end++;
            }
            handleSourcePathSet(source, dest, sameDest);
            start = end;
        }
    }

    /**
     * Now we have a set of walks that share source and destination.
     * Computes the distribution of path types and writes one line
     * "source-dst,pathType,count" per path type to the result file.
     *
     * @param source  common source vertex of the walks
     * @param dst     common destination vertex of the walks
     * @param pathSet walks sharing that source and destination
     * @throws IOException if writing to the result file fails
     */
    private void handleSourcePathSet(int source, int dst, ArrayList<Walk> pathSet)
            throws IOException {
        HashMap<String, Integer> pathDist = new HashMap<String, Integer>();
        for (Walk w : pathSet) {
            String pathType = getPathType(w);
            Integer seen = pathDist.get(pathType);
            pathDist.put(pathType, seen == null ? 1 : seen + 1);
        }

        // Write the distribution out to a file. You probably want to replace this.
        for (Map.Entry<String, Integer> pathCount : pathDist.entrySet()) {
            output.write(source + "-" + dst + "," + pathCount.getKey() + ","
                    + pathCount.getValue() + "\n");
        }
    }

    /**
     * Mock implementation of get-path-type (TODO: replace with a real one).
     * Returns, for each vertex on the path in order, the vertex id modulo 2
     * followed by "-".
     */
    public String getPathType(Walk w) {
        // Fake!
        int[] p = w.getPath();
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < p.length; j++) {
            sb.append(p[j] % 2);
            sb.append("-");
        }
        return sb.toString();
    }

    /**
     * One walk, stored as packed longs: the hop number in the high 32 bits and
     * the vertex id in the low 32 bits, so sorting the longs orders them by hop.
     */
    private static class Walk {
        private long[] path;
        int idx; // number of entries of path that are in use

        private Walk(int maxHops) {
            idx = 0;
            path = new long[maxHops];
        }

        /** Adds one record; records beyond the capacity given at construction are dropped. */
        void addWalk(short hop, int atVertex) {
            // Mask the vertex so a negative id cannot sign-extend into the hop bits.
            long w = (atVertex & VERTEX_MASK) | ((long) hop << 32);
            if (idx < path.length) {
                path[idx++] = w;
            }
        }

        /** Returns true if no record has been added. */
        boolean isEmpty() {
            return idx == 0;
        }

        /** Vertex of the first entry; only valid for a non-empty, sorted walk. */
        int getSource() {
            return (int) (path[0] & VERTEX_MASK);
        }

        /** Vertex of the last entry; only valid for a non-empty, sorted walk. */
        int getDestination() {
            return (int) (path[idx - 1] & VERTEX_MASK);
        }

        /** Returns the vertex ids of the entries in use, in stored order. */
        int[] getPath() {
            int[] p = new int[idx];
            for (int i = 0; i < idx; i++) {
                p[i] = (int) (path[i] & VERTEX_MASK);
            }
            return p;
        }

        /** Sorts the entries in use; unused slots are left untouched at the end. */
        void sort() {
            Arrays.sort(path, 0, idx);
        }

        String getPathDescription() {
            /* Super-slow */
            Arrays.sort(path, 0, idx); // Hop is in the highest-order bits, so this sorts by hop
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < idx; i++) {
                sb.append(path[i] & VERTEX_MASK).append("-");
            }
            return sb.toString();
        }
    }

    // Usage:
    //    [directory-containing the walks] [min-walk-id] [max-walk-id to process] [maxHops]
    public static void main(String[] args) throws Exception {
        if (args.length < 4) {
            System.err.println("Usage: [directory-containing the walks] [min-walk-id] "
                    + "[max-walk-id to process] [maxHops]");
            return;
        }
        WalkPathAnalyzerTemplate analyzer = new WalkPathAnalyzerTemplate(new File(args[0]));
        int minWalkId = Integer.parseInt(args[1]);
        int maxWalkId = Integer.parseInt(args[2]);
        int maxHops = Integer.parseInt(args[3]);

        analyzer.analyze(minWalkId, maxWalkId, maxHops);
    }
}