/* * This file is part of JBIRCH. * * JBIRCH is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * JBIRCH is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with JBIRCH. If not, see <http://www.gnu.org/licenses/>. * */ /* * CFTree.java * Copyright (C) 2009 Roberto Perdisci (roberto.perdisci@gmail.com) */ package org.streaminer.stream.clustering.birch; import java.util.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.streaminer.util.SizeOf; /** * This is an implementation of the BIRCH clustering algorithm described in: * * T. Zhang, R. Ramakrishnan, and M. Livny. * "BIRCH: A New Data Clustering Algorithm and Its Applications" * Data Mining and Knowledge Discovery, 1997. * * @author Roberto Perdisci (roberto.perdisci@gmail.com) * @version 0.1 * */ public class CFTree { private static final Logger LOG = LoggerFactory.getLogger(CFTree.class); /** * Used when computing if the tree is reaching memory limit */ private static final double MEM_LIM_FRAC = 10; /** * Centroid Distance D0 */ public static final int D0_DIST = 0; /** * Centroid distance D1 */ public static final int D1_DIST = 1; /** * Cluster Distance D2 */ public static final int D2_DIST = 2; /** * Cluster Distance D3 */ public static final int D3_DIST = 3; /** * Cluster Distance D4 */ public static final int D4_DIST = 4; /** * The root node of the CFTree */ private CFNode root; /** * dummy node that points to the list of leaves. used for fast retrieval of final subclusters */ private CFNode leafListStart = null; /** * keeps count of the instances inserted into the tree */ private int instanceIndex = 0; /** * if true, the tree is automatically rebuilt every time the memory limit is reached */ private boolean automaticRebuild = false; /** * the memory limit used when automatic rebuilding is active */ private long memLimit = (long)Math.pow(1024, 3); // default = 1GB /** * used when automatic rebuilding is active */ private long periodicMemLimitCheck = 100000; // checks if memeory limit is exceeded every 100,000 insertions /** * * @param maxNodeEntries parameter B * @param distThreshold parameter T * @param distFunction must be one of CFTree.D0_DIST,...,CFTree.D4_DIST, otherwise it will default to D0_DIST * @param applyMergingRefinement if true, activates merging refinement after each node split */ public CFTree(int maxNodeEntries, double distThreshold, int distFunction, boolean applyMergingRefinement) { if(distFunction < D0_DIST || distFunction > D4_DIST) distFunction = D0_DIST; root = new CFNode(maxNodeEntries,distThreshold,distFunction,applyMergingRefinement,true); leafListStart = new CFNode(0,0,distFunction,applyMergingRefinement,true); // this is a dummy node that points to the fist leaf leafListStart.setNextLeaf(root); // at this point root is the only node and therefore also the only leaf } /** * * @return the current memory limit used to trigger automatic rebuilding */ public long getMemoryLimit() { return memLimit; } /** * Gets the start of the list of leaf nodes (remember: the first node is a dummy node) * * @return */ public CFNode getLeafListStart() { return this.leafListStart; } /** * * @param limit memory limit in bytes */ public void setMemoryLimit(long limit) { this.memLimit = limit; } /** * * @param limit memory limit in Mbytes */ public void setMemoryLimitMB(long limit) { this.memLimit = limit*1024*1024; } /** * * @param auto if true, and memory limit is reached, the tree is automatically rebuilt with larger threshold */ public void setAutomaticRebuild(boolean auto) { this.automaticRebuild = auto; } /** * * @param period the number of insert operations after which we check whether the tree has reached the memory limit */ public void setPeriodicMemLimitCheck(long period) { this.periodicMemLimitCheck = period; } /** * Inserts a single pattern vector into the CFTree * * @param x the pattern vector to be inserted in the tree * @return true if insertion was successful */ public boolean insertEntry(double[] x) { instanceIndex++; if (automaticRebuild && (instanceIndex % periodicMemLimitCheck)==0) { // rebuilds the tree if we reached or exceeded memory limits rebuildIfAboveMemLimit(); } return insertEntry(x,instanceIndex); } /** * Insert a pattern vector with a specific associated pattern vector index. * This method does not use periodic memory limit checks. * * @param x the pattern vector to be inserted in the tree * @param index a specific index associated to the pattern vector x * @return true if insertion was successful */ public boolean insertEntry(double[] x, int index) { CFEntry e = new CFEntry(x, index); return insertEntry(e); } /** * Inserts an entire CFEntry into the tree. Used for tree rebuilding. * * @param e the CFEntry to insert * @return true if insertion happened without problems */ private boolean insertEntry(CFEntry e) { boolean dontSplit = root.insertEntry(e); if (!dontSplit) { // if dontSplit is false, it means there was not enough space to insert the new entry in the tree, // therefore wee need to split the root to make more room splitRoot(); if (automaticRebuild) { // rebuilds the tree if we reached or exceeded memory limits rebuildIfAboveMemLimit(); } } return true; // after root is split, we are sure x was inserted correctly in the tree, and we return true } /** * Every time we split the root, we check whether the memory limit imposed on the tree * has been reached. In this case, we automatically increase the distance threshold and * rebuild the tree. * * It is worth noting that since we only check memory consumption only during root split, * and not for all node splits (for performance reasons), we cannot guarantee that * the memory limit will not be exceeded. The tree may grow significantly between a * root split and the next. * Furthermore, the computation of memory consumption using the SizeOf class is only approximate. * * Notice also that if the threshold grows to the point that all the entries fall into one entry * of the root (i.e., the root is the only node in the tree, and has only one sub-cluster) * the automatic rebuild cannot decrease the memory consumption (because increasing the threshold * has not effect on reducing the size of the tree), and if Java runs out of memory * the program will terminate. * * @return true if rebuilt */ private boolean rebuildIfAboveMemLimit() { if (hasReachedMemoryLimit(this, memLimit)) { LOG.info("Size of Tree is reaching or has exceeded the memory limit"); LOG.info("Rebuilding the Tree..."); LOG.info("Current Threshold = " + root.getDistThreshold()); double newThreshold = computeNewThreshold(leafListStart, root.getDistFunction(), root.getDistThreshold()); LOG.info("New Threshold = " + newThreshold); CFTree newTree = this.rebuildTree(root.getMaxNodeEntries(), newThreshold, root.getDistFunction(), root.applyMergingRefinement(), false); copyTree(newTree); return true; } return false; } /** * Splits the root to accommodate a new entry. The height of the tree grows by one. */ private void splitRoot() { // the split happens by finding the two entries in this node that are the most far apart // we then use these two entries as a "pivot" to redistribute the old entries into two new nodes CFEntryPair p = root.findFarthestEntryPair(root.getEntries()); CFEntry newEntry1 = new CFEntry(); CFNode newNode1 = new CFNode(root.getMaxNodeEntries(),root.getDistThreshold(),root.getDistFunction(),root.applyMergingRefinement(),root.isLeaf()); newEntry1.setChild(newNode1); CFEntry newEntry2 = new CFEntry(); CFNode newNode2 = new CFNode(root.getMaxNodeEntries(),root.getDistThreshold(),root.getDistFunction(),root.applyMergingRefinement(),root.isLeaf()); newEntry2.setChild(newNode2); // the new root that hosts the new entries CFNode newRoot = new CFNode(root.getMaxNodeEntries(),root.getDistThreshold(),root.getDistFunction(),root.applyMergingRefinement(),false); newRoot.addToEntryList(newEntry1); newRoot.addToEntryList(newEntry2); // this updates the pointers to the list of leaves if(root.isLeaf()) { // if root was a leaf leafListStart.setNextLeaf(newNode1); newNode1.setPreviousLeaf(leafListStart); newNode1.setNextLeaf(newNode2); newNode2.setPreviousLeaf(newNode1); } // redistributes the entries in the root between newEntry1 and newEntry2 // according to the distance to p.e1 and p.e2 root.redistributeEntries(root.getEntries(),p,newEntry1,newEntry2); // updates the root root = newRoot; // frees some memory by deleting the nodes in the tree that had to be split System.gc(); } /** * Overwrites the structure of this tree (all nodes, entreis, and leaf list) with the structure of newTree. * * @param newTree the tree to be copied */ private void copyTree(CFTree newTree) { this.root = newTree.root; this.leafListStart = newTree.leafListStart; } /** * Computes a new threshold based on the average distance of the closest subclusters in each leaf node * * @param leafListStart the pointer to the start of the list (the first node is assumed to be a place-holder dummy node) * @param distFunction * @param currentThreshold * @return the new threshold */ public double computeNewThreshold(CFNode leafListStart,int distFunction, double currentThreshold) { double avgDist = 0; int n = 0; CFNode l = leafListStart.getNextLeaf(); while (l!=null) { if (!l.isDummy()) { CFEntryPair p = l.findClosestEntryPair(l.getEntries()); if(p!=null) { avgDist += p.e1.distance(p.e2, distFunction); n++; /* This is a possible alternative: Overall avg distance between leaf entries CFEntry[] v = l.getEntries().toArray(new CFEntry[0]); for(int i=0; i < v.length-1; i++) { for(int j=i+1; j < v.length; j++) { avgDist += v[i].distance(v[j], distFunction); n++; } }*/ } } l = l.getNextLeaf(); } double newThreshold = 0; if (n>0) newThreshold = avgDist/n; if (newThreshold <= currentThreshold) { // this guarantees that newThreshold always increases compared to currentThreshold newThreshold = 2*currentThreshold; } return newThreshold; } /** * True if CFTree's memory occupation exceeds or is almost equal to the memory limit * * @param tree the tree to be tested * @param limit the memory limit * @return true if memory limit has been reached */ private boolean hasReachedMemoryLimit(CFTree tree, long limit) { long memory = computeMemorySize(tree); LOG.info("Tree Size = " + SizeOf.humanReadable(memory)); return (memory >= (limit - limit/(double)MEM_LIM_FRAC)); } /** * Computes the memory usage of a CFTree * * @param t a CFTree * @return memory usage in bytes */ private long computeMemorySize(CFTree t) { long memSize = 0; try { memSize = SizeOf.deepSizeOf(t); } catch(Exception e) { LOG.error("Error when computing memory size", e); } return memSize; } /** * This implementation of the rebuilding algorithm is different from * the one described in Section 4.5 of the paper. However the effect * is practically the same. Namely, given a tree t_i build using * threshold T_i, if we set a new threshold T_(i+1) and call * rebuildTree (assuming maxEntries stays the same) we will obtain * a more compact tree. * * Since the CFTree is sensitive to the order of the data, there * may be cases in which, if we set the T_(i+1) so that non of the * sub-clusters (i.e., the leaf entries) can be merged (e.g., T_(i+1)=-1) * we might actually obtain a new tree t_(i+1) containing more nodes * than t_i. However, the obtained sub-clusters in t_(i+1) will be * identical to the sub-clusters in t_i. * * In practice, though, if T_(i+1) > T_(i), the tree t_(i+1) will * usually be smaller than t_i. * Although the Reducibility Theorem in Section 4.5 may not hold * anymore, in practice this will not be a big problem, since * even in those cases in which t_(i+1)>t_i, the growth should * be very small. * * The advantage is that relaxing the constraint that the size * of t_(i+1) must be less than t_i makes the implementation * of the rebuilding algorithm much easier. * * @param newMaxEntries the new number of entries per node * @param newThreshold the new threshold * @param applyMergingRefinement if true, merging refinement will be applied after every split * @param discardOldTree if true, the old tree will be discarded (to free memory) * * @return the new (usually more compact) CFTree */ public CFTree rebuildTree(int newMaxEntries, double newThreshold, int distFunction, boolean applyMergingRefinement, boolean discardOldTree) { CFTree newTree = new CFTree(newMaxEntries, newThreshold, distFunction, applyMergingRefinement); newTree.instanceIndex = this.instanceIndex; newTree.memLimit = this.memLimit; CFNode oldLeavesList = this.leafListStart.getNextLeaf(); // remember: the node this.leafListStart is a dummy node (place holder for beginning of leaf list) if (discardOldTree) { this.root = null; System.gc(); // removes the old tree. Only the old leaves will be kept } CFNode leaf = oldLeavesList; while (leaf!=null) { if (!leaf.isDummy()) { for (CFEntry e : leaf.getEntries()) { CFEntry newE = e; if (!discardOldTree) // we need to make a deep copy of e newE = new CFEntry(e); newTree.insertEntry(newE); } } leaf = leaf.getNextLeaf(); } if (discardOldTree) { this.leafListStart = null; System.gc(); // removes the old list of leaves } return newTree; } /** * * @return a list of subcluster, and for each subcluster a list of pattern vector indexes that belong to it */ public ArrayList<ArrayList<Integer>> getSubclusterMembers() { ArrayList<ArrayList<Integer>> membersList = new ArrayList<ArrayList<Integer>>(); CFNode l = leafListStart.getNextLeaf(); // the first leaf is dummy! while (l!=null) { if (!l.isDummy()) { for(CFEntry e : l.getEntries()) membersList.add(e.getIndexList()); } l = l.getNextLeaf(); } return membersList; } /** * Signals the fact that we finished inserting data. * The obtained subclusters will be assigned a positive, unique ID number */ public void finishedInsertingData() { CFNode l = leafListStart.getNextLeaf(); // the first leaf is dummy! int id = 0; while (l!=null) { if (!l.isDummy()) { for(CFEntry e : l.getEntries()) { id++; e.setSubclusterID(id); } } l = l.getNextLeaf(); } } /** * Retrieves the subcluster id of the closest leaf entry to e * * @param e the entry to be mapped * @return a positive integer, if the leaf entries were enumerated using finishedInsertingData(), otherwise -1 */ public int mapToClosestSubcluster(double[] x) { CFEntry e = new CFEntry(x); return root.mapToClosestSubcluster(e); } /** * Computes an estimate of the cost of running an O(n^2) algorithm to split each subcluster in more fine-grained clusters * * @return sqrt(sum_i[(n_i)^2]), where n_i is the number of members of the i-th subcluster */ public double computeSumLambdaSquared() { double lambdaSS = 0; CFNode l = leafListStart.getNextLeaf(); while (l!=null) { if (!l.isDummy()) { for(CFEntry e : l.getEntries()) { lambdaSS += Math.pow(e.getIndexList().size(), 2); } } l = l.getNextLeaf(); } return Math.sqrt(lambdaSS); } /** * prints the CFTree */ public void printCFTree() { System.out.println(root); } /** * Counts the nodes of the tree (including leaves) * * @return the number of nodes in the tree */ public int countNodes() { int n = 1; // at least root has to be present n += root.countChildrenNodes(); return n; } /** * Counts the number of CFEntries in the tree * * @return the number of entries in the tree */ public int countEntries() { int n = root.size(); // at least root has to be present n += root.countEntriesInChildrenNodes(); return n; } /** * Counts the number of leaf entries (i.e., the number of sub-clusters in the tree) * * @return the number of leaf entries (i.e., the number of sub-clusters) */ public int countLeafEntries() { int i=0; CFNode l = leafListStart.getNextLeaf(); while (l!=null) { if (!l.isDummy()) { i += l.size(); } l = l.getNextLeaf(); } return i; } /** * Prints the index of all the pattern vectors that fall into the leaf nodes. * This is only useful for debugging purposes. */ public void printLeafIndexes() { ArrayList<Integer> indexes = new ArrayList<Integer>(); CFNode l = leafListStart.getNextLeaf(); while (l!=null) { if (!l.isDummy()) { System.out.println(l); for(CFEntry e : l.getEntries()) indexes.addAll(e.getIndexList()); } l = l.getNextLeaf(); } Integer[] v = indexes.toArray(new Integer[0]); Arrays.sort(v); System.out.println("Num of Indexes = " + v.length); System.out.println(Arrays.toString(v)); } /** * Prints the index of the pattern vectors in each leaf entry (i.e., each subcluster) */ public void printLeafEntries() { int i=0; CFNode l = leafListStart.getNextLeaf(); while (l!=null) { if (!l.isDummy()) { for (CFEntry e : l.getEntries()) { System.out.println("[[" + (++i) +"]]"); Integer[] v = e.getIndexList().toArray(new Integer[0]); Arrays.sort(v); System.out.println(Arrays.toString(v)); } } l = l.getNextLeaf(); } } }