package edu.cmu.graphchi.walks; import com.yammer.metrics.core.TimerContext; import edu.cmu.graphchi.ChiLogger; import edu.cmu.graphchi.Scheduler; import edu.cmu.graphchi.engine.VertexInterval; import java.io.*; import java.util.*; import java.util.logging.Logger; /** * Manager for random walks * * Done partially during authors internship at Twitter, Fall 2012. * @author Aapo Kyrola, akyrola@cs.cmu.edu */ public class IntWalkManager extends WalkManager { protected int[][] walks; private static final Logger logger = ChiLogger.getLogger("int-walk-manager"); public IntWalkManager(int numVertices, int numSources) { super(numVertices, numSources); } @Override protected void setSourceAndBucketBits() { MAX_SOURCES = 16777216; // 24 bits for the source id bucketSize = 128; // 7 bits for the bucket size } /** * Encode a walk. Note, as sourceIdx is the highest order bits, the * walks can be sorted by source simply by sorting the list. * @param sourceId index of the rousce vertex * @param hop true if odd, false if even * @param off bucket offset * @return */ int encode(int sourceId, boolean hop, int off) { assert(off < 128); int hopbit = (hop ? 1 : 0); return ((sourceId & 0xffffff) << 8) | ((off & 0x7f) << 1) | hopbit; } int encodeV(int sourceId, boolean hop, int vertexId) { return encode(sourceId, hop, vertexId % bucketSize); } int encodeNewWalk(int sourceId, int sourceVertex, boolean hop) { return encode(sourceId, hop, sourceVertex % bucketSize); } public int sourceIdx(int walk) { return ((walk & 0xffffff00) >> 8) & 0xffffff; } public boolean trackBit(int walk) { return ((walk & 1) != 0); } public int off(int walk) { return (walk >> 1) & 0x7f; } /** * Resets the bucket offset to reflect the new destination vertex, and also resets the track * bit, according to the parameters. Note that those are the _only_ things re-encoded by this * method, as those are the only things this method has access to; if other parts of the walk * need to be changed, that must be taken care of in the WalkUpdateFunction _before_ forwarding * the walk. */ public int reencodeWalk(int walk, int toVertex, boolean trackBit) { int bucket = toVertex / bucketSize; return encode(sourceIdx(walk), trackBit, toVertex % bucketSize); } /** * @param sourceId * @param toVertex * @param trackBit true if odd, false if even hop */ public void moveWalk(int walk, int toVertex, boolean trackBit) { int bucket = toVertex / bucketSize; synchronized (bucketLocks[bucket]) { moveWalkUnsafe(walk, toVertex, trackBit); } } public void moveWalkUnsafe(int walk, int toVertex, boolean trackBit) { // Re-encode the walk to reflect the movement walk = reencodeWalk(walk, toVertex, trackBit); // Move the walk to the new bucket for processing int bucket = toVertex / bucketSize; int idx = walkIndices[bucket]; if (idx == 0) { walks[bucket] = new int[initialSize]; } else { if (idx == walks[bucket].length) { int[] newBucket = new int[walks[bucket].length * 3 / 2]; System.arraycopy(walks[bucket], 0, newBucket, 0, walks[bucket].length); walks[bucket] = newBucket; } } walks[bucket][idx] = walk; walkIndices[bucket]++; } @Override protected void expandCapacity(int bucket, int additional) { if (walks[bucket] != null) { int desiredLength = walks[bucket].length + additional; if (walks[bucket].length < desiredLength) { int[] newBucket = new int[desiredLength]; System.arraycopy(walks[bucket], 0, newBucket, 0, walks[bucket].length); walks[bucket] = newBucket; } } else { walks[bucket] = new int[additional]; } } @Override public void initializeWalks() { walks = new int[1 + numVertices / bucketSize][]; bucketLocks = new Object[walks.length]; for(int i=0; i<bucketLocks.length; i++) bucketLocks[i] = new Object(); walkIndices = new int[walks.length]; for(int i = 0; i < walks.length; i++) { walks[i] = null; walkIndices[i] = 0; } /* Truncate sources */ if (sourceSeqIdx < sources.length) { logger.info("Truncating..."); int[] tmpsrcs = new int[sourceSeqIdx]; System.arraycopy(sources, 0, tmpsrcs, 0, sourceSeqIdx); sources = tmpsrcs; } logger.info("Calculate sizes. Walks length:" + walks.length); /* Precalculate bucket sizes for performance */ int[] tmpsizes = new int[walks.length]; for(int j=0; j < sourceSeqIdx; j++) { int source = sources[j]; tmpsizes[source / bucketSize] += sourceWalkCounts[j]; } logger.info("Expand capacities"); for(int b=0; b < walks.length; b++) { expandCapacity(b, tmpsizes[b]); } logger.info("Allocating walks"); for(int i=0; i < sourceSeqIdx; i++) { int source = sources[i]; int count = sourceWalkCounts[i]; int walk = encodeNewWalk(i, source, false); int bucket = source / bucketSize; int idx = walkIndices[bucket]; for(int c=0; c<count; c++) { walks[bucket][idx++] = walk; } walkIndices[bucket] += count; if (i % 100000 == 0) logger.info(i + " / " + sourceSeqIdx); } sourceWalkCounts = null; logger.info("Set bitset..."); // Create source-bitset for(int i=0; i < sourceSeqIdx; i++) { sourceBitSet.set(sources[i], true); } } @Override public WalkSnapshot grabSnapshot(final int fromVertex, final int toVertexInclusive) { final int fromBucket = fromVertex / bucketSize; final int toBucket = toVertexInclusive / bucketSize; final boolean[] snapshotInitBits = new boolean[toBucket - fromBucket + 1]; final boolean[] processedBits = new boolean[1 + toVertexInclusive - fromVertex]; for(int b=fromBucket; b <= toBucket; b++) { snapshotInitBits[b - fromBucket] = false; } /* Now create data structure for fast retrieval */ final int[][] snapshots = new int[toVertexInclusive - fromVertex + 1][]; /* Create the snapshot object. It creates the snapshot arrays on-demand * to save memory. */ return new WalkSnapshot() { @Override public void clear(int vertexId) { snapshots[vertexId - fromVertex] = null; } @Override public void restoreUngrabbed() { final TimerContext _timer = restore.time(); // Restore such walks that were not grabbed (because the vertex // was not initially scheduled) int v = fromVertex; int restoreCount = 0; for(int[] snapshot : snapshots) { if (snapshot != null && !processedBits[v - fromVertex]) { for(int i=0; i<snapshot.length; i++) { int w = snapshot[i]; moveWalk(w, v, trackBit(w)); restoreCount++; } } v++; } logger.info("Restored " + restoreCount); _timer.stop(); } // Note: accurate number only before snapshot is being purged public long numWalks() { long sum = 0; for(int b=fromBucket; b <= toBucket; b++) { sum += walks[b].length; } return sum; } @Override public WalkArray getWalksAtVertex(int vertexId, boolean processed) { int bucketIdx = vertexId / bucketSize; int localBucketIdx = bucketIdx - (fromVertex / bucketSize); processedBits[vertexId - fromVertex] = true; if (snapshotInitBits[localBucketIdx]) { int[] array = snapshots[vertexId - fromVertex]; if (array == null) { return null; } else { return new IntWalkArray(snapshots[vertexId - fromVertex]); } } else { final TimerContext _timer = grabTimer.time(); int[] bucketToConsume = null; int len = 0; synchronized (bucketLocks[bucketIdx]) { if (!snapshotInitBits[localBucketIdx]) { int bucketFirstVertex = bucketSize * bucketIdx; len = walkIndices[bucketIdx]; bucketToConsume = walks[bucketIdx]; if (bucketToConsume != null) { walks[bucketIdx] = null; walkIndices[bucketIdx] = 0; final int[] snapshotSizes = new int[bucketSize]; final int[] snapshotIdxs = new int[bucketSize]; /* Calculate vertex-walks sizes */ for(int i=0; i < len; i++) { int w = bucketToConsume[i]; snapshotSizes[off(w)]++; } int offt = bucketFirstVertex - fromVertex; for(int i=0; i < snapshotSizes.length; i++) { if (snapshotSizes[i] > 0 && i >= -offt && i + offt < snapshots.length) snapshots[i + offt] = new int[snapshotSizes[i]]; } for(int i=0; i < len; i++) { int w = bucketToConsume[i]; int vertex = bucketFirstVertex + off(w); if (vertex >= fromVertex && vertex <= toVertexInclusive) { int snapshotOff = vertex - fromVertex; int localOff = vertex - bucketFirstVertex; snapshots[snapshotOff][snapshotIdxs[localOff]] = w; snapshotIdxs[localOff]++; } else { // add back moveWalk(w, vertex, trackBit(w)); } } } snapshotInitBits[localBucketIdx] = true; } } if (bucketConsumer != null && bucketToConsume != null && len > 0) { bucketConsumer.consume(bucketIdx * bucketSize, new IntWalkArray(bucketToConsume), len); if (len > 1000000) { log((bucketIdx * bucketSize) + " - " + ((bucketIdx+1)) * bucketSize + ", " + len); } } _timer.stop(); int[] array = snapshots[vertexId - fromVertex]; if (array == null) { return null; } else { return new IntWalkArray(snapshots[vertexId - fromVertex]); } } } @Override public int getFirstVertex() { return fromVertex; } @Override public int getLastVertex() { return toVertexInclusive; } }; } /** Dump to file all walks with more than 0 hop */ @Override public void dumpToFile(WalkSnapshot snapshot, String filename) throws IOException { final TimerContext _timer = dumpTimer.time(); synchronized (filename.intern()) { DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(filename), true))); for(int i=snapshot.getFirstVertex(); i <= snapshot.getLastVertex(); i++) { int[] ws = ((IntWalkArray)snapshot.getWalksAtVertex(i, false)).getArray(); if (ws != null) { for(int j=0; j < ws.length; j++) { int w = ws[j]; int source = sources[sourceIdx(w)]; dos.writeInt(source); dos.writeInt(i); } } } dos.flush(); dos.close(); } _timer.stop(); } public int getSourceVertex(int walk) { return sources[sourceIdx(walk)]; } @Override public void populateSchedulerForInterval(Scheduler scheduler, VertexInterval interval) { final TimerContext _timer = schedulePopulate.time(); int fromBucket = interval.getFirstVertex() / bucketSize; int toBucket = interval.getLastVertex() / bucketSize; for(int bucketIdx=fromBucket; bucketIdx <= toBucket; bucketIdx++) { int vertexBase = bucketIdx * bucketSize; int[] bucket = walks[bucketIdx]; if (bucket != null) { BitSet alreadySeen = new BitSet(bucketSize); int counter = 0; for(int j=0; j<bucket.length; j++) { int off = off(bucket[j]); if (!alreadySeen.get(off)) { alreadySeen.set(off, true); counter++; scheduler.addTask(vertexBase + off); if (counter == bucketSize) break; } } } } _timer.stop(); } }