CFTree.java example

Explorer

streaminer-master
- src
  - main
    - java
      - org
        streaminer
        stream
        avg
        ExponentialMovingAverage.java
        IAverage.java
        MovingAverage.java
        SimpleEWMA.java
        TEWMA.java
        VariableEWMA.java
        cardinality
        AdaptiveCounting.java
        BJKST.java
        CardinalityMergeException.java
        CountThenEstimate.java
        FlajoletMartin.java
        HyperLogLog.java
        HyperLogLogPlus.java
        IBaseCardinality.java
        IRichCardinality.java
        KMinValues.java
        LinearCounting.java
        LogLog.java
        RegisterSet.java
        change
        AbsoluteChange.java
        AbstractChange.java
        AbstractCusum.java
        CusumNP.java
        RelativeChange.java
        VariationalChange.java
        classifier
        AROWClassifier.java
        AbstractClassifier.java
        AbstractMultiClassifier.java
        BWinnowClassifier.java
        Classifier.java
        MajorityClass.java
        MultiClassPAClassifier.java
        MultiRandomClassifier.java
        PAClassifier.java
        Perceptron.java
        RandomClassifier.java
        SimpleClassifier.java
        WinnowClassifier.java
        bayes
        BoundedBayes.java
        LossyBayes.java
        MultiBayes.java
        MultiLossyBayes.java
        MultiTopkBayes.java
        NaiveBayes.java
        NaiveBayesWOP.java
        StickySamplingBayes.java
        TopKBayes.java
        tree
        BTreeNode.java
        BinaryTreeNode.java
        ChernoffSplitCriterion.java
        ChernoffStatistics.java
        HoeffdingTree.java
        HoeffdingTreeModel.java
        HoeffdingTreeNode.java
        InnerNode.java
        LeafNode.java
        LinearRegression.java
        ModelTree.java
        NodeInfo.java
        QualityCriterion.java
        RTree.java
        RTreeStatistics.java
        Range.java
        RegressionTree.java
        RegressionTreeModel.java
        RegressionTreeNode.java
        RegressionTreeStatistics.java
        SplitCriterion.java
        TreeNode.java
        Visitor.java
        clustering
        IClusterer.java
        KMeans.java
        birch
        CFEntry.java
        CFEntryPair.java
        CFNode.java
        CFTree.java
        data
        Data.java
        DataImpl.java
        DataUtils.java
        entropy
        EntropyExact.java
        EntropySketch.java
        IEntropy.java
        eval
        ConfusionMatrix.java
        TableOfConfusion.java
        frequency
        AMSSketch.java
        BaseFrequency.java
        CCFCSketch.java
        CGT.java
        CountMinSketch.java
        CountMinSketchAlt.java
        CountSketch.java
        FrequencyException.java
        IBaseFrequency.java
        IFrequencyList.java
        IRichFrequency.java
        ISimpleFrequency.java
        ITimeDecayFrequency.java
        LossyCounting.java
        Majority.java
        MisraGries.java
        RealCounting.java
        SimpleTopKCounting.java
        SpaceSaving.java
        StickySampling.java
        TimeDecayCountMinSketch.java
        TimeDecayRealCounting.java
        decay
        DecayFormula.java
        ExpDecayFormula.java
        LinDecayFormula.java
        LogDecayFormula.java
        Quantity.java
        topk
        ConcurrentStreamSummary.java
        Frequent.java
        ITopK.java
        StochasticTopper.java
        StreamSummary.java
        util
        CountEntry.java
        CountEntryWithMaxError.java
        Counter.java
        ISampleSet.java
        SampleSet.java
        ScoredItem.java
        histogram
        spdt
        ArrayBinReservoir.java
        ArrayCategoricalTarget.java
        Bin.java
        BinReservoir.java
        BinUpdateException.java
        CategoricalTarget.java
        Gap.java
        GroupTarget.java
        Histogram.java
        MapCategoricalTarget.java
        MixedInsertException.java
        NumericTarget.java
        SimpleTarget.java
        SumOutOfRangeException.java
        SumResult.java
        Target.java
        TreeBinReservoir.java
        learner
        AbstractRegressor.java
        Learner.java
        LearnerUtils.java
        Regressor.java
        mapper
        AddMax.java
        AddMinMax.java
        AttributeFilter.java
        ExtractDouble.java
        IMapper.java
        Identity.java
        KeyMapper.java
        Normalization.java
        NumericalBinning.java
        SelectFeaturesMapper.java
        membership
        AbstractFilter.java
        BloomCalculations.java
        BloomFilter.java
        BloomFilterAlt.java
        CountingBloomFilter.java
        CuckooFilter.java
        DLeftCountingBloomFilter.java
        DynamicBloomFilter.java
        HashFunction.java
        IFilter.java
        Key.java
        ODTDBloomFilter.java
        RemoveScheme.java
        RetouchedBloomFilter.java
        StableBloomFilter.java
        TimingBloomFilter.java
        VarCountingBloomFilter.java
        misc
        MovingWindowDelta.java
        Rate.java
        Significance.java
        Statistics.java
        model
        DescriptionModel.java
        Distribution.java
        HyperplaneModel.java
        Model.java
        NominalDistributionModel.java
        NumericalDistributionModel.java
        PredictionModel.java
        SelectiveDescriptionModel.java
        SlidingWindow.java
        StreamDistribution.java
        norm
        StableSketch.java
        quantile
        CKMSQuantiles.java
        EnsembleQuantiles.java
        ExactQuantiles.java
        ExactQuantilesAll.java
        Frugal2U.java
        GKQuantiles.java
        GroupTree.java
        IQuantiles.java
        MPQuantiles.java
        QDigest.java
        QuantilesException.java
        SimpleQuantiles.java
        SumQuantiles.java
        TDigest.java
        WindowSketchQuantiles.java
        rss
        Bucket.java
        Interval.java
        RSSQuantiles.java
        Subset.java
        SubsetTree.java
        sampler
        BernoulliSampler.java
        ChainSampler.java
        FrequentSampler.java
        ISampleList.java
        ISampler.java
        L0Sampler.java
        ReservoirSampler.java
        SpaceSavingSampler.java
        SystematicSampler.java
        WRSampler.java
        WeightedRandomSampler.java
        gamma
        GammaFunction.java
        R.java
        Z.java
        sre
        OneSparseRecoveryEstimator.java
        SSparseRecoveryEstimator.java
        wavelet
        HaarWaveletTransform.java
        IWavelet.java
        util
        AbstractIterator.java
        ArrayUtils.java
        Bits.java
        ByteArrayTable.java
        ByteUtil.java
        DoublyLinkedList.java
        ExternalizableUtil.java
        IBuilder.java
        ListNode2.java
        Lists.java
        Pair.java
        Preconditions.java
        QuotedStringTokenizer.java
        SizeOf.java
        UnsignedIntComparator.java
        Varint.java
        distance
        CosineDistance.java
        Kernel.java
        LinearDistance.java
        Radius.java
        SquaredDistance.java
        distribution
        ZipfDistribution.java
        hash
        Hash.java
        HashUtils.java
        JenkinsHash.java
        Lookup3Hash.java
        MurmurHash.java
        MurmurHash3.java
        SpookyHash.java
        SpookyHash32.java
        SpookyHash64.java
        factory
        HashFunctionFactory.java
        SimpleHashFactory.java
        TwoUniversalHashFactory.java
        function
        HashFunction.java
        JenkinsHashFunction.java
        MurmurHashFunction.java
        SimpleHashFunction.java
        TwoUniversalHashFunction.java
        math
        MathHelper.java
        MathUtil.java
        NumberUtil.java
        PowerOfTwo.java
        Prime.java
  - test
    - java
      - org
        streaminer
        stream
        avg
        TEWMATest.java
        benchmark
        CountMinSketchBenchmark.java
        cardinality
        BJKSTTest.java
        FlajoletMartinTest.java
        KMinValuesTest.java
        RegisterSetTest.java
        TestAdaptiveCounting.java
        TestCountThenEstimate.java
        TestHyperLogLog.java
        TestHyperLogLogPlus.java
        TestICardinality.java
        TestLinearCounting.java
        TestLogLog.java
        change
        AbsoluteChangeTest.java
        ExactSolution.java
        StreamGenerator.java
        classifier
        bayes
        NaiveBayesTest.java
        entropy
        EntropySketchTest.java
        frequency
        AMSSketchTest.java
        CCFCSketchTest.java
        CGTTest.java
        CountMinSketchAltTest.java
        SimpleTopKCountingTest.java
        SpaceSavingTest.java
        StreamGenerator.java
        TimeDecayCountMinSketchTest.java
        topk
        TestConcurrentStreamSummary.java
        TestStochasticTopper.java
        TestStreamSummary.java
        util
        TestSampleSet.java
        membership
        BloomFilterAltTest.java
        BloomFilterTest.java
        CountingBloomFilterTest.java
        CuckooFilterTest.java
        DLeftCountingBloomFilterTest.java
        ODTDBloomFilterTest.java
        StableBloomFilterTest.java
        TimingBloomFilterTest.java
        VarCountingBloomFilterTest.java
        model
        NominalDistributionModelTest.java
        norm
        StableSketchTest.java
        quantile
        CKMSQuantilesTest.java
        Frugal2UTest.java
        GroupTreeTest.java
        MPQuantilesTest.java
        QDigestTest.java
        TDigestTest.java
        sampler
        ChainSamplerTest.java
        util
        TestDoublyLinkedList.java
        hash
        Lookup3HashTest.java
        MurmurHashTest.java
        SpookyHash32Test.java
        SpookyHash64Test.java

/*
 *  This file is part of JBIRCH.
 *
 *  JBIRCH is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  JBIRCH is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with JBIRCH.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

/*
 *  CFTree.java
 *  Copyright (C) 2009 Roberto Perdisci (roberto.perdisci@gmail.com)
 */


package org.streaminer.stream.clustering.birch;


import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.streaminer.util.SizeOf;

/**
 * This is an implementation of the BIRCH clustering algorithm described in:
 * 
 * T. Zhang, R. Ramakrishnan, and M. Livny.
 * "BIRCH: A New Data Clustering Algorithm and Its Applications"
 * Data Mining and Knowledge Discovery, 1997.
 * 
 * @author Roberto Perdisci (roberto.perdisci@gmail.com)
 * @version 0.1
 *
 */
public class CFTree {
    private static final Logger LOG = LoggerFactory.getLogger(CFTree.class);
	
    /**
     * Used when computing if the tree is reaching memory limit
     */
    private static final double MEM_LIM_FRAC = 10;

    /**
     * Centroid Distance D0
     */
    public static final int D0_DIST = 0;

    /**
     * Centroid distance D1
     */
    public static final int D1_DIST = 1;

    /**
     * Cluster Distance D2
     */
    public static final int D2_DIST = 2;

    /**
     * Cluster Distance D3
     */
    public static final int D3_DIST = 3;

    /**
     * Cluster Distance D4
     */
    public static final int D4_DIST = 4;

    /**
     * The root node of the CFTree
     */
    private CFNode root;

    /**
     * dummy node that points to the list of leaves. used for fast retrieval of final subclusters
     */
    private CFNode leafListStart = null;

    /**
     * keeps count of the instances inserted into the tree
     */
    private int instanceIndex = 0;

    /**
     * if true, the tree is automatically rebuilt every time the memory limit is reached
     */
    private boolean automaticRebuild = false;

    /**
     * the memory limit used when automatic rebuilding is active
     */
    private long memLimit = (long)Math.pow(1024, 3); // default = 1GB

    /**
     * used when automatic rebuilding is active
     */
    private long periodicMemLimitCheck = 100000; // checks if memeory limit is exceeded every 100,000 insertions

    /**
     * 
     * @param maxNodeEntries parameter B
     * @param distThreshold parameter T
     * @param distFunction must be one of CFTree.D0_DIST,...,CFTree.D4_DIST, otherwise it will default to D0_DIST 
     * @param applyMergingRefinement if true, activates merging refinement after each node split
     */
    public CFTree(int maxNodeEntries, double distThreshold, int distFunction, boolean applyMergingRefinement) {
        if(distFunction < D0_DIST || distFunction > D4_DIST)
            distFunction = D0_DIST;

        root = new CFNode(maxNodeEntries,distThreshold,distFunction,applyMergingRefinement,true);
        leafListStart = new CFNode(0,0,distFunction,applyMergingRefinement,true); // this is a dummy node that points to the fist leaf
        leafListStart.setNextLeaf(root); // at this point root is the only node and therefore also the only leaf
    }

    /**
     * 
     * @return the current memory limit used to trigger automatic rebuilding
     */
    public long getMemoryLimit() {
        return memLimit;
    }

    /**
     * Gets the start of the list of leaf nodes (remember: the first node is a dummy node)
     * 
     * @return
     */
    public CFNode getLeafListStart() {
        return this.leafListStart;
    }

    /**
     * 
     * @param limit memory limit in bytes
     */
    public void setMemoryLimit(long limit) {
        this.memLimit = limit;
    }

    /**
     * 
     * @param limit memory limit in Mbytes
     */
    public void setMemoryLimitMB(long limit) {
        this.memLimit = limit*1024*1024;
    }

    /**
     * 
     * @param auto if true, and memory limit is reached, the tree is automatically rebuilt with larger threshold
     */
    public void setAutomaticRebuild(boolean auto) {
        this.automaticRebuild = auto;
    }

    /**
     * 
     * @param period the number of insert operations after which we check whether the tree has reached the memory limit
     */
    public void setPeriodicMemLimitCheck(long period) {
        this.periodicMemLimitCheck = period;
    }

    /**
     * Inserts a single pattern vector into the CFTree
     * 
     * @param x the pattern vector to be inserted in the tree
     * @return true if insertion was successful
     */
    public boolean insertEntry(double[] x) {
        instanceIndex++;

        if (automaticRebuild && (instanceIndex % periodicMemLimitCheck)==0) {
            // rebuilds the tree if we reached or exceeded memory limits
            rebuildIfAboveMemLimit();
        }

        return insertEntry(x,instanceIndex);
    }

    /**
     * Insert a pattern vector with a specific associated pattern vector index.
     * This method does not use periodic memory limit checks.
     * 
     * @param x the pattern vector to be inserted in the tree
     * @param index a specific index associated to the pattern vector x
     * @return true if insertion was successful
     */
    public boolean insertEntry(double[] x, int index) {
        CFEntry e = new CFEntry(x, index);
        return insertEntry(e);
    }

    /**
     * Inserts an entire CFEntry into the tree. Used for tree rebuilding.
     * 
     * @param e the CFEntry to insert
     * @return true if insertion happened without problems
     */
    private boolean insertEntry(CFEntry e) {
        boolean dontSplit = root.insertEntry(e);
        if (!dontSplit) {
            // if dontSplit is false, it means there was not enough space to insert the new entry in the tree, 
            // therefore wee need to split the root to make more room
            splitRoot();

            if (automaticRebuild) {
                // rebuilds the tree if we reached or exceeded memory limits
                rebuildIfAboveMemLimit();
            }
        }

        return true; // after root is split, we are sure x was inserted correctly in the tree, and we return true
    }

    /**
     * Every time we split the root, we check whether the memory limit imposed on the tree
     * has been reached. In this case, we automatically increase the distance threshold and
     * rebuild the tree. 
     * 
     * It is worth noting that since we only check memory consumption only during root split,
     * and not for all node splits (for performance reasons), we cannot guarantee that
     * the memory limit will not be exceeded. The tree may grow significantly between a 
     * root split and the next.
     * Furthermore, the computation of memory consumption using the SizeOf class is only approximate.
     * 
     * Notice also that if the threshold grows to the point that all the entries fall into one entry
     * of the root (i.e., the root is the only node in the tree, and has only one sub-cluster)
     * the automatic rebuild cannot decrease the memory consumption (because increasing the threshold
     * has not effect on reducing the size of the tree), and if Java runs out of memory
     * the program will terminate.
     * 
     * @return true if rebuilt
     */
    private boolean rebuildIfAboveMemLimit() {
        if (hasReachedMemoryLimit(this, memLimit)) {
            LOG.info("Size of Tree is reaching or has exceeded the memory limit");
            LOG.info("Rebuilding the Tree...");
            LOG.info("Current Threshold = " + root.getDistThreshold());
            
            double newThreshold = computeNewThreshold(leafListStart, root.getDistFunction(), root.getDistThreshold());
            LOG.info("New Threshold = " + newThreshold);

            CFTree newTree = this.rebuildTree(root.getMaxNodeEntries(), newThreshold, root.getDistFunction(), root.applyMergingRefinement(), false);
            copyTree(newTree);

            return true;
        }

        return false;
    }

    /**
     * Splits the root to accommodate a new entry. The height of the tree grows by one.
     */
    private void splitRoot() {
        // the split happens by finding the two entries in this node that are the most far apart
        // we then use these two entries as a "pivot" to redistribute the old entries into two new nodes

        CFEntryPair p = root.findFarthestEntryPair(root.getEntries());

        CFEntry newEntry1 = new CFEntry();
        CFNode newNode1 = new CFNode(root.getMaxNodeEntries(),root.getDistThreshold(),root.getDistFunction(),root.applyMergingRefinement(),root.isLeaf());
        newEntry1.setChild(newNode1);

        CFEntry newEntry2 = new CFEntry();
        CFNode newNode2 = new CFNode(root.getMaxNodeEntries(),root.getDistThreshold(),root.getDistFunction(),root.applyMergingRefinement(),root.isLeaf());
        newEntry2.setChild(newNode2);

        // the new root that hosts the new entries
        CFNode newRoot = new CFNode(root.getMaxNodeEntries(),root.getDistThreshold(),root.getDistFunction(),root.applyMergingRefinement(),false);
        newRoot.addToEntryList(newEntry1);
        newRoot.addToEntryList(newEntry2);

        // this updates the pointers to the list of leaves
        if(root.isLeaf()) { // if root was a leaf
            leafListStart.setNextLeaf(newNode1);
            newNode1.setPreviousLeaf(leafListStart);
            newNode1.setNextLeaf(newNode2);
            newNode2.setPreviousLeaf(newNode1);
        }

        // redistributes the entries in the root between newEntry1 and newEntry2
        // according to the distance to p.e1 and p.e2
        root.redistributeEntries(root.getEntries(),p,newEntry1,newEntry2);

        // updates the root
        root = newRoot;

        // frees some memory by deleting the nodes in the tree that had to be split
        System.gc();
    }

    /**
     * Overwrites the structure of this tree (all nodes, entreis, and leaf list) with the structure of newTree.
     * 
     * @param newTree the tree to be copied
     */
    private void copyTree(CFTree newTree) {
        this.root = newTree.root;
        this.leafListStart = newTree.leafListStart;
    }

    /**
     * Computes a new threshold based on the average distance of the closest subclusters in each leaf node
     * 
     * @param leafListStart the pointer to the start of the list (the first node is assumed to be a place-holder dummy node)
     * @param distFunction 
     * @param currentThreshold
     * @return the new threshold
     */
    public double computeNewThreshold(CFNode leafListStart,int distFunction, double currentThreshold) {
        double avgDist = 0;
        int n = 0;

        CFNode l = leafListStart.getNextLeaf();
        while (l!=null) {
            if (!l.isDummy()) {
                CFEntryPair p = l.findClosestEntryPair(l.getEntries());
                if(p!=null) {
                    avgDist += p.e1.distance(p.e2, distFunction);
                    n++;

                    /* This is a possible alternative: Overall avg distance between leaf entries
                    CFEntry[] v = l.getEntries().toArray(new CFEntry[0]);
                    for(int i=0; i < v.length-1; i++) {
                            for(int j=i+1; j < v.length; j++) {
                                    avgDist += v[i].distance(v[j], distFunction);
                                    n++;
                            }
                    }*/
                }
            }
            l = l.getNextLeaf();
        }

        double newThreshold = 0;
        if (n>0)
            newThreshold = avgDist/n;

        if (newThreshold <= currentThreshold) { // this guarantees that newThreshold always increases compared to currentThreshold
            newThreshold = 2*currentThreshold;
        }

        return newThreshold;
    }

    /**
     * True if CFTree's memory occupation exceeds or is almost equal to the memory limit
     * 
     * @param tree the tree to be tested
     * @param limit the memory limit
     * @return true if memory limit has been reached
     */
    private boolean hasReachedMemoryLimit(CFTree tree, long limit) {
        long memory = computeMemorySize(tree);

        LOG.info("Tree Size = " + SizeOf.humanReadable(memory));
        return (memory >= (limit - limit/(double)MEM_LIM_FRAC));
    }

    /**
     * Computes the memory usage of a CFTree
     * 
     * @param t a CFTree
     * @return memory usage in bytes
     */
    private long computeMemorySize(CFTree t) {
        long memSize = 0;
        try {
            memSize = SizeOf.deepSizeOf(t);
        } catch(Exception e) {
            LOG.error("Error when computing memory size", e);
        }
        
        return memSize;
    }

    /**
     * This implementation of the rebuilding algorithm is different from
     * the one described in Section 4.5 of the paper. However the effect
     * is practically the same. Namely, given a tree t_i build using
     * threshold T_i, if we set a new threshold T_(i+1) and call
     * rebuildTree (assuming maxEntries stays the same) we will obtain
     * a more compact tree. 
     * 
     * Since the CFTree is sensitive to the order of the data, there
     * may be cases in which, if we set the T_(i+1) so that non of the
     * sub-clusters (i.e., the leaf entries) can be merged (e.g., T_(i+1)=-1)
     * we might actually obtain a new tree t_(i+1) containing more nodes
     * than t_i. However, the obtained sub-clusters in t_(i+1) will be 
     * identical to the sub-clusters in t_i.
     * 
     * In practice, though, if T_(i+1) > T_(i), the tree t_(i+1) will
     * usually be smaller than t_i.
     * Although the Reducibility Theorem in Section 4.5 may not hold
     * anymore, in practice this will not be a big problem, since 
     * even in those cases in which t_(i+1)>t_i, the growth should
     * be very small.
     * 
     * The advantage is that relaxing the constraint that the size
     * of t_(i+1) must be less than t_i makes the implementation
     * of the rebuilding algorithm much easier.
     * 
     * @param newMaxEntries the new number of entries per node
     * @param newThreshold the new threshold
     * @param applyMergingRefinement if true, merging refinement will be applied after every split
     * @param discardOldTree if true, the old tree will be discarded (to free memory)
     * 
     * @return the new (usually more compact) CFTree
     */
    public CFTree rebuildTree(int newMaxEntries, double newThreshold, int distFunction, boolean applyMergingRefinement, boolean discardOldTree) {
        CFTree newTree = new CFTree(newMaxEntries, newThreshold, distFunction, applyMergingRefinement);
        newTree.instanceIndex = this.instanceIndex;
        newTree.memLimit = this.memLimit;

        CFNode oldLeavesList = this.leafListStart.getNextLeaf(); // remember: the node this.leafListStart is a dummy node (place holder for beginning of leaf list)

        if (discardOldTree) {
            this.root = null;
            System.gc(); // removes the old tree. Only the old leaves will be kept
        }

        CFNode leaf = oldLeavesList;
        while (leaf!=null) {
            if (!leaf.isDummy()) {
                for (CFEntry e : leaf.getEntries()) {
                    CFEntry newE = e;
                    if (!discardOldTree) // we need to make a deep copy of e
                        newE = new CFEntry(e);

                    newTree.insertEntry(newE);
                }
            }

            leaf = leaf.getNextLeaf();
        }

        if (discardOldTree) {
            this.leafListStart = null; 
            System.gc(); // removes the old list of leaves
        }

        return newTree;
    }


    /**
     * 
     * @return a list of subcluster, and for each subcluster a list of pattern vector indexes that belong to it
     */
    public ArrayList<ArrayList<Integer>> getSubclusterMembers() {
        ArrayList<ArrayList<Integer>> membersList = new ArrayList<ArrayList<Integer>>();

        CFNode l = leafListStart.getNextLeaf(); // the first leaf is dummy!
        while (l!=null) {
            if (!l.isDummy()) {
                for(CFEntry e : l.getEntries())
                    membersList.add(e.getIndexList());
            }
            l = l.getNextLeaf();
        }

        return membersList;
    }

    /**
     * Signals the fact that we finished inserting data.
     * The obtained subclusters will be assigned a positive, unique ID number
     */
    public void finishedInsertingData() {
        CFNode l = leafListStart.getNextLeaf(); // the first leaf is dummy!

        int id = 0;
        while (l!=null) {
            if (!l.isDummy()) {
                for(CFEntry e : l.getEntries()) {
                    id++;
                    e.setSubclusterID(id);
                }
            }
            l = l.getNextLeaf();
        }
    }

    /**
     * Retrieves the subcluster id of the closest leaf entry to e
     * 
     * @param e the entry to be mapped
     * @return a positive integer, if the leaf entries were enumerated using finishedInsertingData(), otherwise -1
     */
    public int mapToClosestSubcluster(double[] x) {
        CFEntry e = new CFEntry(x);

        return root.mapToClosestSubcluster(e);
    }

    /**
     * Computes an estimate of the cost of running an O(n^2) algorithm to split each subcluster in more fine-grained clusters
     * 
     * @return sqrt(sum_i[(n_i)^2]), where n_i is the number of members of the i-th subcluster
     */
    public double computeSumLambdaSquared() {
        double lambdaSS = 0;

        CFNode l = leafListStart.getNextLeaf();
        while (l!=null) {
            if (!l.isDummy()) {
                for(CFEntry e : l.getEntries()) {
                    lambdaSS += Math.pow(e.getIndexList().size(), 2);
                }
            }
            l = l.getNextLeaf();
        }

        return Math.sqrt(lambdaSS);
    }

    /**
     * prints the CFTree
     */
    public void printCFTree() {
        System.out.println(root);
    }

    /**
     * Counts the nodes of the tree (including leaves)
     * 
     * @return the number of nodes in the tree
     */
    public int countNodes() {
        int n = 1; // at least root has to be present
        n += root.countChildrenNodes();

        return n;
    }

    /**
     * Counts the number of CFEntries in the tree
     * 
     * @return the number of entries in the tree
     */
    public int countEntries() {
        int n = root.size(); // at least root has to be present
        n += root.countEntriesInChildrenNodes();

        return n;
    }

    /**
     * Counts the number of leaf entries (i.e., the number of sub-clusters in the tree)
     * 
     * @return the number of leaf entries (i.e., the number of sub-clusters)
     */
    public int countLeafEntries() {
        int i=0;
        CFNode l = leafListStart.getNextLeaf();
        while (l!=null) {
            if (!l.isDummy()) {
                i += l.size();
            }

            l = l.getNextLeaf();
        }

        return i;
    }

    /**
     * Prints the index of all the pattern vectors that fall into the leaf nodes.
     * This is only useful for debugging purposes.
     */
    public void printLeafIndexes() {
        ArrayList<Integer> indexes = new ArrayList<Integer>();

        CFNode l = leafListStart.getNextLeaf();
        while (l!=null) {
            if (!l.isDummy()) {
                System.out.println(l);
                for(CFEntry e : l.getEntries())
                    indexes.addAll(e.getIndexList());
            }
            l = l.getNextLeaf();
        }

        Integer[] v = indexes.toArray(new Integer[0]);
        Arrays.sort(v);

        System.out.println("Num of Indexes = " + v.length);
        System.out.println(Arrays.toString(v));
    }

    /**
     * Prints the index of the pattern vectors in each leaf entry (i.e., each subcluster)
     */
    public void printLeafEntries() {
        int i=0;
        CFNode l = leafListStart.getNextLeaf();
        while (l!=null) {
            if (!l.isDummy()) {
                for (CFEntry e : l.getEntries()) {
                    System.out.println("[[" + (++i) +"]]");
                    Integer[] v = e.getIndexList().toArray(new Integer[0]);
                    Arrays.sort(v);
                    System.out.println(Arrays.toString(v));
                }
            }

            l = l.getNextLeaf();
        }
    }
}