DiscreteDistribution.java example

Explorer

graphchi-java-master
- src
  - main
    - java
      - com
        twitter
        pers
        bipartite
        HITSSmallMem.java
        SALSASmallMem.java
        graph_generator
        EdgeListOutput.java
        GraphOutput.java
        RMATGraphGenerator.java
      - edu
        cmu
        graphchi
        ChiEdge.java
        ChiFilenames.java
        ChiLogger.java
        ChiVertex.java
        EdgeDirection.java
        EmptyType.java
        GraphChiContext.java
        GraphChiProgram.java
        Scheduler.java
        apps
        ALSMatrixFactorization.java
        ConnectedComponents.java
        Pagerank.java
        SmokeTest.java
        WeightedPagerank.java
        kcore
        GraphTransformer.java
        KCoreDecomposer.java
        pig
        PigALSMatrixFactorization.java
        PigPagerank.java
        PigWeightedPagerank.java
        randomwalks
        PersonalizedPageRank.java
        recommendations
        CircleOfTrustSalsa.java
        MovieRecommender.java
        TwitterWTF.java
        util
        CreateCassovary.java
        datablocks
        BytesToValueConverter.java
        ChiPointer.java
        DataBlockManager.java
        FloatConverter.java
        FloatPair.java
        FloatPairConverter.java
        IntConverter.java
        LongConverter.java
        engine
        BitsetScheduler.java
        GraphChiEngine.java
        VertexInterval.java
        auxdata
        DegreeData.java
        VertexData.java
        VertexDegree.java
        hadoop
        HDFSGraphLoader.java
        PigGraphChiBase.java
        io
        CompressedIO.java
        preprocessing
        EdgeProcessor.java
        FastSharder.java
        VertexIdTranslate.java
        VertexProcessor.java
        queries
        VertexQuery.java
        demo
        FriendsOfFriends.java
        shards
        MemoryShard.java
        ShardIndex.java
        SlidingShard.java
        toolkits
        collaborative_filtering
        ALS.java
        Common.java
        IO.java
        ProblemSetup.java
        RMSEEngine.java
        util
        DegreeFileReader.java
        FileUtils.java
        HugeDoubleMatrix.java
        HugeFloatMatrix.java
        HugeLongMatrix.java
        IdCount.java
        IdFloat.java
        IdInt.java
        IntegerBuffer.java
        LabelAnalysis.java
        MultinomialSampler.java
        Toplist.java
        vertexdata
        ForeachCallback.java
        VertexAggregator.java
        VertexIdValue.java
        VertexTransformCallBack.java
        VertexTransformer.java
        walks
        BucketsToSend.java
        DrunkardContext.java
        DrunkardDriver.java
        DrunkardFactory.java
        DrunkardJob.java
        DrunkardMobEngine.java
        DumperThread.java
        GrabbedBucketConsumer.java
        IntDrunkardContext.java
        IntDrunkardDriver.java
        IntDrunkardFactory.java
        IntDumperThread.java
        IntLocalWalkBuffer.java
        IntWalkArray.java
        IntWalkManager.java
        LocalWalkBuffer.java
        LongDrunkardContext.java
        LongDrunkardDriver.java
        LongDrunkardFactory.java
        LongDumperThread.java
        LongLocalWalkBuffer.java
        LongWalkArray.java
        LongWalkManager.java
        WalkArray.java
        WalkDistributionAnalyzer.java
        WalkManager.java
        WalkSnapshot.java
        WalkUpdateFunction.java
        WeightedHopper.java
        analysis
        WalkPathAnalyzerTemplate.java
        distributions
        DiscreteDistribution.java
        DrunkardClient.java
        DrunkardCompanion.java
        IntDrunkardCompanion.java
        LongDrunkardCompanion.java
        RemoteDrunkardCompanion.java
        TwoKeyCompanion.java
      - nom
        tam
        util
        BufferedDataInputStream.java
      - ucar
        unidata
        io
        RandomAccessFile.java
- test
  - edu
    - cmu
      - graphchi
        TestChiVertex.java
        datablocks
        TestFloatConverter.java
        TestFloatPairConverter.java
        TestIntConverter.java
        TestLongConverter.java
        preprocessing
        TestIdPacking.java
        TestVertexIdTranslate.java
        util
        TestHugeFloatMatrix.java
        TestIntegerBuffer.java
        walks
        TestWalkManager.java
        TestWeightedHopper.java
        distribution
        TestDiscreteDistribution.java

package edu.cmu.graphchi.walks.distributions;

import edu.cmu.graphchi.util.IdCount;

import java.util.*;

/**
 * A compact map from integer ids to frequencies, stored as two parallel arrays
 * ({@code ids} in ascending order, {@code counts} as {@code short}).
 *
 * <p>A negative count marks an id as an <em>avoid</em> entry (see
 * {@link #createAvoidanceDistribution(int[])}): such ids keep their negative marker
 * through {@link #merge(DiscreteDistribution, DiscreteDistribution)} and
 * {@link #filteredAndShift(int)}, which lets callers exclude certain ids from
 * merged distributions.
 *
 * <p>Counts are stored as {@code short}; operations that could exceed
 * {@link Short#MAX_VALUE} saturate at that value instead of wrapping around to a
 * negative number (which would otherwise be mistaken for an avoid marker).
 *
 * <p>The public methods never modify an existing instance; they return either
 * {@code this} (when nothing changes) or a newly created distribution.
 *
 * @author Aapo Kyrola, akyrola@cs.cmu.edu
 */
public class DiscreteDistribution {

    /** Count value used to mark an id as "avoided". */
    private static final short AVOID = -1;

    /** Ids in ascending order; parallel to {@link #counts}. */
    private int[] ids;
    /** Frequency of each id, or a negative value for avoid entries. */
    private short[] counts;
    /** Number of valid entries; always equal to the length of both arrays. */
    private int uniqueCount;


    /**
     * Creates a distribution with room for exactly {@code initialCapacity} entries.
     * The caller is expected to fill every slot of both arrays.
     */
    private DiscreteDistribution(int initialCapacity) {
        ids = new int[initialCapacity];
        counts = new short[initialCapacity];
        uniqueCount = initialCapacity;
    }

    /**
     * Create an empty distribution
     */
    public DiscreteDistribution() {
        this(0);
    }

    /**
     * Constructs a distribution from a sorted list of entries. Each run of equal
     * values becomes one entry whose count is the length of the run (saturated at
     * {@link Short#MAX_VALUE}).
     *
     * @param sortedIdList ids in ascending order, duplicates allowed
     * @throws RuntimeException if the list is not in ascending order
     */
    public DiscreteDistribution(int[] sortedIdList) {
        if (sortedIdList.length == 0) {
            ids = new int[0];
            counts = new short[0];
            uniqueCount = 0;
            return;
        }

        /* First count unique ids, validating the ordering on the way. */
        int unique = 1;
        for (int i = 1; i < sortedIdList.length; i++) {
            if (sortedIdList[i] != sortedIdList[i - 1]) {
                if (sortedIdList[i] < sortedIdList[i - 1]) {
                    throw new RuntimeException("Not ordered! " + sortedIdList[i] + " < " + sortedIdList[i - 1]);
                }
                unique++;
            }
        }

        ids = new int[unique];
        counts = new short[unique];
        // Set for every non-empty input, including the single-element case,
        // so that size(), getTop() and merge() see the entry.
        uniqueCount = unique;

        /* Create counts: emit one entry each time a run of equal ids ends. */
        int idx = 0;
        int curCount = 1; // int, so a very long run cannot wrap before saturation
        for (int i = 1; i < sortedIdList.length; i++) {
            if (sortedIdList[i] != sortedIdList[i - 1]) {
                ids[idx] = sortedIdList[i - 1];
                counts[idx] = saturate(curCount);
                idx++;
                curCount = 1;
            } else {
                curCount++;
            }
        }

        /* The final run is not terminated by a differing id, so emit it here. */
        ids[idx] = sortedIdList[sortedIdList.length - 1];
        counts[idx] = saturate(curCount);
    }

    /** Narrows a non-negative count to {@code short}, clamping at {@link Short#MAX_VALUE}. */
    private static short saturate(int count) {
        return count > Short.MAX_VALUE ? Short.MAX_VALUE : (short) count;
    }

    /**
     * Returns a distribution containing only the first {@code size} entries
     * (i.e. the {@code size} smallest ids).
     *
     * @param size number of leading entries to keep
     * @return {@code this} if {@code size} exceeds the current number of entries,
     *         otherwise a new truncated distribution
     * @throws NegativeArraySizeException if {@code size} is negative
     */
    public DiscreteDistribution forceToSize(int size) {
        if (size > this.ids.length) return this;
        DiscreteDistribution d = new DiscreteDistribution();
        d.ids = Arrays.copyOf(ids, size);
        d.counts = Arrays.copyOf(counts, size);
        d.uniqueCount = size;
        return d;
    }

    /**
     * @return the sum of all positive counts; avoid entries are ignored
     */
    public int totalCount() {
        int tot = 0;
        for (int c = 0; c < counts.length; c++) {
            if (counts[c] > 0) tot += counts[c];
        }
        return tot;
    }


    /**
     * Prints every entry (including avoid entries) to standard output,
     * one "D id, count" line per entry.
     */
    public void print() {
        for (int i = 0; i < ids.length; i++) {
            System.out.println("D " + ids[i] + ", " + counts[i]);
        }
    }

    /**
     * Returns up to {@code topN} entries with the highest counts, ordered by
     * descending count and, for equal counts, ascending id. Avoid entries take
     * part with their negative count, so they appear only when fewer than
     * {@code topN} entries have a higher count.
     *
     * @param topN maximum number of entries to return
     * @return the top entries; empty if {@code topN <= 0} or the distribution is empty
     */
    public IdCount[] getTop(int topN) {
        if (topN <= 0) {
            // Nothing can be selected; also avoids dereferencing a missing "least" element.
            return new IdCount[0];
        }

        final TreeSet<IdCount> topList = new TreeSet<IdCount>(new Comparator<IdCount>() {
            public int compare(IdCount a, IdCount b) {
                // Descending by count, then ascending by id
                if (a.count != b.count) return a.count > b.count ? -1 : 1;
                if (a.id != b.id) return a.id < b.id ? -1 : 1;
                return 0;
            }
        });

        for (int i = 0; i < uniqueCount; i++) {
            if (topList.size() < topN) {
                topList.add(new IdCount(ids[i], counts[i]));
            } else {
                // List is full: replace the current minimum only if strictly better.
                IdCount least = topList.last();
                if (counts[i] > least.count) {
                    topList.remove(least);
                    topList.add(new IdCount(ids[i], counts[i]));
                }
            }
        }

        return topList.toArray(new IdCount[topList.size()]);
    }


    /**
     * Creates a new distribution in which all entries with a count less than
     * {@code minimumCount} are removed and every remaining count {@code c} is
     * replaced by {@code c - minimumCount + 1} (so an entry with exactly
     * {@code minimumCount} ends up with count 1). Avoid entries (negative counts)
     * are never removed and stay marked as avoided.
     *
     * @param minimumCount smallest count to keep; values {@code <= 1} change nothing
     * @return {@code this} if the result would be identical, otherwise a new distribution
     */
    public DiscreteDistribution filteredAndShift(int minimumCount) {
        if (minimumCount <= 1) {
            return this;
        }

        int toRemove = 0;
        int toShift = 0;
        for (int i = 0; i < uniqueCount; i++) {
            if (counts[i] >= 0) {
                if (counts[i] >= minimumCount) {
                    toShift++;
                } else {
                    toRemove++;
                }
            }
        }

        if (toRemove == 0 && toShift == 0) {
            return this;   // Only avoid entries (or nothing): result is identical
        }

        DiscreteDistribution filteredDist = new DiscreteDistribution(uniqueCount - toRemove);
        int idx = 0;
        for (int i = 0; i < uniqueCount; i++) {
            if (counts[i] < 0) {
                filteredDist.ids[idx] = ids[i];
                filteredDist.counts[idx] = AVOID;
                idx++;
            } else if (counts[i] >= minimumCount) {
                filteredDist.ids[idx] = ids[i];
                // Result is in [1, Short.MAX_VALUE - 1] because minimumCount >= 2.
                filteredDist.counts[idx] = (short) (counts[i] - minimumCount + 1);
                idx++;
            }
        }
        return filteredDist;
    }

    /**
     * Same as {@link #filteredAndShift(int)}; kept for callers that pass a {@code short}.
     *
     * @param minimumCount smallest count to keep; values {@code <= 1} change nothing
     * @return {@code this} if the result would be identical, otherwise a new distribution
     */
    public DiscreteDistribution filteredAndShift(short minimumCount) {
        return filteredAndShift((int) minimumCount);
    }

    /**
     * Create a special avoidance distribution, where each count is -1.
     * @param avoids  sorted list of ids to avoid
     * @return a distribution holding one avoid entry per distinct id in {@code avoids}
     */
    public static DiscreteDistribution createAvoidanceDistribution(int[] avoids) {
        DiscreteDistribution avoidDistr = new DiscreteDistribution(avoids);
        Arrays.fill(avoidDistr.counts, AVOID);
        return avoidDistr;
    }


    /**
     * @param id the id to look up
     * @return the stored count for {@code id} (negative for an avoid entry),
     *         or 0 if the id is not present
     */
    public int getCount(int id) {
        int idx = Arrays.binarySearch(ids, id);
        return idx >= 0 ? counts[idx] : 0;
    }

    /**
     * @return the number of distinct ids stored, including avoid entries
     */
    public int size() {
        return uniqueCount;
    }

    /**
     * @return the number of avoid entries (entries with a negative count)
     */
    public int avoidCount() {
        int a = 0;
        for (int i = 0; i < counts.length; i++) {
            if (counts[i] < 0) a++;
        }
        return a;
    }

    /**
     * Rough estimate of the memory used by this distribution in bytes:
     * 6 bytes per entry (a 4-byte int id plus a 2-byte short count), 4 more
     * bytes, and a fixed guess for JVM object overhead.
     */
    public int memorySizeEst() {
        int OVERHEAD = 64; // Estimate of the java memory overhead
        return 6 * counts.length + 4 + OVERHEAD;
    }


    /**
     * @return the largest stored count (avoid entries count as their negative
     *         value), or {@link Integer#MIN_VALUE} if the distribution is empty
     */
    public int max() {
        int mx = Integer.MIN_VALUE;
        for (int i = 0; i < counts.length; i++) {
            if (counts[i] > mx) mx = counts[i];
        }
        return mx;
    }

    /**
     * Merges two distributions. Counts of ids present in both are added together
     * (saturating at {@link Short#MAX_VALUE}). If either has a negative entry for
     * some id, it will remain negative in the merged distribution
     * (see {@link #createAvoidanceDistribution(int[])}).
     *
     * @param d1 first distribution
     * @param d2 second distribution
     * @return the merged distribution; one of the arguments is returned as-is when
     *         the other one is empty
     */
    public static DiscreteDistribution merge(DiscreteDistribution d1, DiscreteDistribution d2) {

        if (d1.uniqueCount == 0) return d2;
        if (d2.uniqueCount == 0) return d1;

        /* Merge style algorithm over the two sorted id arrays */

        /* 1. first count the number of distinct ids in the union */
        int combinedUniqueIndices = 0;
        int leftIdx = 0;
        int rightIdx = 0;
        final int leftCount = d1.size();
        final int rightCount = d2.size();

        while (leftIdx < leftCount && rightIdx < rightCount) {
            if (d1.ids[leftIdx] == d2.ids[rightIdx]) {
                leftIdx++;
                rightIdx++;
            } else if (d1.ids[leftIdx] < d2.ids[rightIdx]) {
                leftIdx++;
            } else {
                rightIdx++;
            }
            combinedUniqueIndices++;
        }
        combinedUniqueIndices += (rightCount - rightIdx);
        combinedUniqueIndices += (leftCount - leftIdx);

        /* 2. create the merged distribution by doing the actual merge */
        DiscreteDistribution merged = new DiscreteDistribution(combinedUniqueIndices);
        leftIdx = 0;
        rightIdx = 0;
        int idx = 0;
        while (leftIdx < leftCount && rightIdx < rightCount) {
            if (d1.ids[leftIdx] == d2.ids[rightIdx]) {
                merged.ids[idx] = d1.ids[leftIdx];
                if (d1.counts[leftIdx] < 0 || d2.counts[rightIdx] < 0) {
                    // Force to retain negativity
                    merged.counts[idx] = AVOID;
                } else {
                    // Sum in int and clamp so the result cannot wrap to a negative short.
                    merged.counts[idx] = saturate(d1.counts[leftIdx] + d2.counts[rightIdx]);
                }
                leftIdx++;
                rightIdx++;
            } else if (d1.ids[leftIdx] < d2.ids[rightIdx]) {
                merged.ids[idx] = d1.ids[leftIdx];
                merged.counts[idx] = d1.counts[leftIdx];   // Note, retains negativity
                leftIdx++;
            } else {
                merged.ids[idx] = d2.ids[rightIdx];
                merged.counts[idx] = d2.counts[rightIdx];   // Note, retains negativity
                rightIdx++;
            }
            idx++;
        }

        /* 3. copy whatever remains on either side (at most one side is non-empty) */
        while (leftIdx < leftCount) {
            merged.ids[idx] = d1.ids[leftIdx];
            merged.counts[idx] = d1.counts[leftIdx];
            leftIdx++;
            idx++;
        }

        while (rightIdx < rightCount) {
            merged.ids[idx] = d2.ids[rightIdx];
            merged.counts[idx] = d2.counts[rightIdx];
            rightIdx++;
            idx++;
        }
        return merged;
    }

    /**
     * @return the number of entries with a strictly positive count
     */
    public int sizeExcludingAvoids() {
        int j = 0;
        for (int i = 0; i < uniqueCount; i++) {
            if (counts[i] > 0) j++;
        }
        return j;
    }
}