/* * This file is part of JBIRCH. * * JBIRCH is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * JBIRCH is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with JBIRCH. If not, see <http://www.gnu.org/licenses/>. * */ /* * CFNode.java * Copyright (C) 2009 Roberto Perdisci (roberto.perdisci@gmail.com) */ package org.streaminer.stream.clustering.birch; import java.util.*; /** * * @author Roberto Perdisci (roberto.perdisci@gmail.com) * */ public class CFEntry { private static final String LINE_SEP = System.getProperty("line.separator"); private int n = 0; // number of patterns summarized by this entry private double[] sumX = null; private double[] sumX2 = null; private CFNode child = null; private ArrayList<Integer> indexList = null; private int subclusterID = -1; // the unique id the describes a subcluster (valid only for leaf entries) public CFEntry() { } public CFEntry(double[] x) { this(x,0); } public CFEntry(double[] x, int index) { this.n = 1; this.sumX = new double[x.length]; for(int i=0; i<sumX.length; i++) sumX[i] = x[i]; this.sumX2 = new double[x.length]; for(int i=0; i<sumX2.length; i++) sumX2[i] = x[i]*x[i]; indexList = new ArrayList<Integer>(); indexList.add(index); } /** * This makes a deep copy of the CFEntry e. * WARNING: we do not make a deep copy of the child!!! * * @param e the entry to be cloned */ public CFEntry(CFEntry e) { this.n = e.n; this.sumX = e.sumX.clone(); this.sumX2 = e.sumX2.clone(); this.child = e.child; // WARNING: we do not make a deep copy of the child!!! this.indexList = new ArrayList<Integer>(); for(int i : e.getIndexList()) // this makes sure we get a deep copy of the indexList this.indexList.add(i); } protected ArrayList<Integer> getIndexList() { return indexList; } protected boolean hasChild() { return(child!=null); } protected CFNode getChild() { return child; } protected int getChildSize() { return child.getEntries().size(); } protected void setChild(CFNode n) { child = n; indexList = null; // we don't keep this if this becomes a non-leaf entry } protected void setSubclusterID(int id) { subclusterID = id; } protected int getSubclusterID() { return subclusterID; } protected void update(CFEntry e) { this.n += e.n; if(this.sumX==null) this.sumX = e.sumX.clone(); else { for(int i=0; i<sumX.length; i++) this.sumX[i] += e.sumX[i]; } if(this.sumX2==null) this.sumX2 = e.sumX2.clone(); else { for(int i=0; i<sumX2.length; i++) this.sumX2[i] += e.sumX2[i]; } if(!this.hasChild()) { // we keep indexList only if we are at a leaf if(this.indexList!=null && e.indexList!=null) this.indexList.addAll(e.indexList); else if(this.indexList==null && e.indexList!=null) this.indexList = (ArrayList<Integer>)e.indexList.clone(); } } protected void addToChild(CFEntry e) { // adds directly to the child node child.getEntries().add(e); } protected boolean isWithinThreshold(CFEntry e, double threshold, int distFunction) { double dist = distance(e, distFunction); // System.out.println("Distance = " + dist); if(dist==0 || dist<=threshold) // read the comments in function d0() about differences with implementation in R return true; return false; } /** * * @param e * @return the distance between this entry and e */ protected double distance(CFEntry e, int distFunction) { double dist = Double.MAX_VALUE; switch(distFunction) { case CFTree.D0_DIST: dist = d0(this,e); break; case CFTree.D1_DIST: dist = d1(this,e); break; case CFTree.D2_DIST: dist = d2(this,e); break; case CFTree.D3_DIST: dist = d3(this,e); break; case CFTree.D4_DIST: dist = d4(this,e); break; } return dist; } private double d0(CFEntry e1, CFEntry e2) { double dist = 0; for(int i=0; i<e1.sumX.length; i++) { double diff = e1.sumX[i]/e1.n - e2.sumX[i]/e2.n; dist += diff*diff; } if(dist<0) System.err.println("d0 < 0 !!!"); // notice here that in the R implementation of BIRCH (package birch) // // the radius parameter is based on the squared distance /dist/ // this causes a difference in results. // if we change the line below into // return dist; // the results produced by the R implementation and this Java implementation // will match perfectly (notice that in the R implementation maxEntries = 100 // and merging refinement is not implemented) return Math.sqrt(dist); } private double d1(CFEntry e1, CFEntry e2) { double dist = 0; for(int i=0; i<e1.sumX.length; i++) { double diff = Math.abs(e1.sumX[i]/e1.n - e2.sumX[i]/e2.n); dist += diff; } if(dist<0) System.err.println("d1 < 0 !!!"); return dist; } private double d2(CFEntry e1, CFEntry e2) { double dist = 0; int n1 = e1.n; int n2 = e2.n; for(int i=0; i<e1.sumX.length; i++) { double diff = (n2*e1.sumX2[i] - 2*e1.sumX[i]*e2.sumX[i] + n1*e2.sumX2[i])/(n1 * n2); dist += diff; } if(dist<0) System.err.println("d2 < 0 !!!"); return Math.sqrt(dist); } private double d3(CFEntry e1, CFEntry e2) { double dist = 0; int n1 = e1.n; int n2 = e2.n; double[] totSumX = e1.sumX.clone(); double[] totSumX2 = e1.sumX2.clone(); for(int i=0; i<e2.sumX.length; i++) { totSumX[i] += e2.sumX[i]; totSumX2[i] += e2.sumX2[i]; } for(int i=0; i<totSumX.length; i++) { double diff = ((n1+n2)*totSumX2[i] - 2*totSumX[i]*totSumX[i] + (n1+n2)*totSumX2[i])/((n1+n2)*(n1+n2-1)); dist += diff; } if(dist<0) System.err.println("d3 < 0 !!!"); return Math.sqrt(dist); } private double d4(CFEntry e1, CFEntry e2) { double dist = 0; int n1 = e1.n; int n2 = e2.n; double[] totSumX = e1.sumX.clone(); double[] totSumX2 = e1.sumX2.clone(); for(int i=0; i<e2.sumX.length; i++) { totSumX[i] += e2.sumX[i]; totSumX2[i] += e2.sumX2[i]; } for(int i=0; i<totSumX.length; i++) { double diff1 = totSumX2[i] - 2*totSumX[i]*totSumX[i]/(n1+n2) + (n1+n2)*(totSumX[i]/(n1+n2))*(totSumX[i]/(n1+n2)); double diff2 = e1.sumX2[i] - 2*e1.sumX[i]*e1.sumX[i]/n1 + n1*(e1.sumX[i]/n1)*(e1.sumX[i]/n1); double diff3 = e2.sumX2[i] - 2*e2.sumX[i]*e2.sumX[i]/n2 + n2*(e2.sumX[i]/n2)*(e2.sumX[i]/n2); dist += diff1 - diff2 - diff3; } if(dist<0) System.err.println("d4 < 0 !!!"); return Math.sqrt(dist); } public boolean equals(Object o) { CFEntry e = (CFEntry)o; if(this.n != e.n) return false; if(this.child!=null && e.child==null) return false; if(this.child==null && e.child!=null) return false; if(this.child!=null && !this.child.equals(e.child)) return false; if(this.indexList==null && e.indexList!=null) return false; if(this.indexList!=null && e.indexList==null) return false; if(!Arrays.equals(this.sumX, e.sumX)) return false; if(!Arrays.equals(this.sumX2, e.sumX2)) return false; if(this.indexList!=null && !this.indexList.equals(e.indexList)) return false; return true; } public String toString() { StringBuffer buff = new StringBuffer(); buff.append(" "); for(int i=0; i<sumX.length; i++) buff.append(sumX[i]/n + " "); if(this.indexList!=null) { buff.append("( "); for(int i : indexList) { buff.append(i+" "); } buff.append(")"); } if(this.hasChild()) { buff.append(LINE_SEP); buff.append("||" + LINE_SEP); buff.append("||" + LINE_SEP); buff.append(this.getChild()); } return buff.toString(); } }