/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * Copyright (C) 2008 * & Alexander Smirnov (austellus@gmail.com) */ package weka.clusterers; import java.io.Serializable; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; import java.util.Vector; import weka.core.Capabilities; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.SparseInstance; import weka.core.TechnicalInformation; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.core.Capabilities.Capability; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; /** <!-- globalinfo-start --> * Yiling Yang, Xudong Guan, Jinyuan You: CLOPE: a fast and effective clustering algorithm for transactional data. In: Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining, 682-687, 2002. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{Yang2002, * author = {Yiling Yang and Xudong Guan and Jinyuan You}, * booktitle = {Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining}, * pages = {682-687}, * publisher = {ACM New York, NY, USA}, * title = {CLOPE: a fast and effective clustering algorithm for transactional data}, * year = {2002} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -R <num> * Repulsion * (default 2.6)</pre> * <!-- options-end --> * * @author Alexander Smirnov (austellus@gmail.com) * @version $Revision: 5538 $ */ public class CLOPE extends AbstractClusterer implements OptionHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -567567567567588L; /** * Inner class for cluster of CLOPE. * * @see Serializable */ private class CLOPECluster implements Serializable { /** * Number of transactions */ public int N = 0; //number of transactions /** * Number of distinct items (or width) */ public int W = 0; /** * Size of cluster */ public int S = 0; /** * Hash of <item, occurrence> pairs */ public HashMap occ = new HashMap(); /** * Add item to cluster */ public void AddItem(String Item) { int count; if (!this.occ.containsKey(Item)) { this.occ.put(Item, 1); } else { count = (Integer) this.occ.get(Item); count++; this.occ.remove(Item); this.occ.put(Item, count); } this.S++; } public void AddItem(Integer Item) { int count; if (!this.occ.containsKey(Item)) { this.occ.put(Item, 1); } else { count = (Integer) this.occ.get(Item); count++; this.occ.remove(Item); this.occ.put(Item, count); } this.S++; } /** * Delete item from cluster */ public void DeleteItem(String Item) { int count; count = (Integer) this.occ.get(Item); if (count == 1) { this.occ.remove(Item); } else { count--; this.occ.remove(Item); this.occ.put(Item, count); } this.S--; } public void DeleteItem(Integer Item) { int count; count = (Integer) this.occ.get(Item); if (count == 1) { this.occ.remove(Item); } else { count--; this.occ.remove(Item); this.occ.put(Item, count); } this.S--; } /** * Calculate Delta */ public double DeltaAdd(Instance inst, double r) { //System.out.println("DeltaAdd"); int S_new; int W_new; double profit; double profit_new; double deltaprofit; S_new = 0; W_new = occ.size(); if (inst instanceof SparseInstance) { //System.out.println("DeltaAddSparceInstance"); for (int i = 0; i < inst.numValues(); i++) { S_new++; if ((Integer) this.occ.get(inst.index(i)) == null) { W_new++; } } } else { for (int i = 0; i < inst.numAttributes(); i++) { if (!inst.isMissing(i)) { S_new++; if ((Integer) this.occ.get(i + inst.toString(i)) == null) { W_new++; } } } } S_new += S; if (N == 0) { deltaprofit = S_new / Math.pow(W_new, r); } else { profit = S * N / Math.pow(W, r); profit_new = S_new * (N + 1) / Math.pow(W_new, r); deltaprofit = profit_new - profit; } return deltaprofit; } /** * Add instance to cluster */ public void AddInstance(Instance inst) { if (inst instanceof SparseInstance) { // System.out.println("AddSparceInstance"); for (int i = 0; i < inst.numValues(); i++) { AddItem(inst.index(i)); // for(int i=0;i<inst.numAttributes();int++){ // AddItem(inst.index(i)+inst.value(i)); } } else { for (int i = 0; i < inst.numAttributes(); i++) { if (!inst.isMissing(i)) { AddItem(i + inst.toString(i)); } } } this.W = this.occ.size(); this.N++; } /** * Delete instance from cluster */ public void DeleteInstance(Instance inst) { if (inst instanceof SparseInstance) { // System.out.println("DeleteSparceInstance"); for (int i = 0; i < inst.numValues(); i++) { DeleteItem(inst.index(i)); } } else { for (int i = 0; i <= inst.numAttributes() - 1; i++) { if (!inst.isMissing(i)) { DeleteItem(i + inst.toString(i)); } } } this.W = this.occ.size(); this.N--; } } /** * Array of clusters */ public ArrayList<CLOPECluster> clusters = new ArrayList<CLOPECluster>(); /** * Specifies the repulsion default */ protected double m_RepulsionDefault = 2.6; /** * Specifies the repulsion */ protected double m_Repulsion = m_RepulsionDefault; /** * Number of clusters */ protected int m_numberOfClusters = -1; /** * Counter for the processed instances */ protected int m_processed_InstanceID; /** * Number of instances */ protected int m_numberOfInstances; /** * */ protected ArrayList<Integer> m_clusterAssignments = new ArrayList(); /** * whether the number of clusters was already determined */ protected boolean m_numberOfClustersDetermined = false; public int numberOfClusters() { determineNumberOfClusters(); return m_numberOfClusters; } protected void determineNumberOfClusters() { m_numberOfClusters = clusters.size(); m_numberOfClustersDetermined = true; } public Enumeration listOptions() { Vector result = new Vector(); result.addElement(new Option( "\tRepulsion\n" + "\t(default " + m_RepulsionDefault + ")", "R", 1, "-R <num>")); return result.elements(); } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -R <num> * Repulsion * (default 2.6)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; tmpStr = Utils.getOption('R', options); if (tmpStr.length() != 0) { setRepulsion(Double.parseDouble(tmpStr)); } else { setRepulsion(m_RepulsionDefault); } } /** * Gets the current settings of CLOPE * * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions() { Vector result; result = new Vector(); result.add("-R"); result.add("" + getRepulsion()); return (String[]) result.toArray(new String[result.size()]); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String repulsionTipText() { return "Repulsion to be used."; } /** * set the repulsion * * @param value the repulsion * @throws Exception if number of clusters is negative */ public void setRepulsion(double value) { m_Repulsion = value; } /** * gets the repulsion * * @return the repulsion */ public double getRepulsion() { return m_Repulsion; } /** * Returns default capabilities of the clusterer. * * @return the capabilities of this clusterer */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); result.enable(Capability.NO_CLASS); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); // result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); return result; } /** * Generate Clustering via CLOPE * @param instances The instances that need to be clustered * @throws java.lang.Exception If clustering was not successful */ public void buildClusterer(Instances data) throws Exception { clusters.clear(); m_processed_InstanceID = 0; m_clusterAssignments.clear(); m_numberOfInstances = data.numInstances(); boolean moved; //Phase 1 for (int i = 0; i < data.numInstances(); i++) { int clusterid = AddInstanceToBestCluster(data.instance(i)); m_clusterAssignments.add(clusterid); } //Phase 2 do { moved = false; for (int i = 0; i < data.numInstances(); i++) { m_processed_InstanceID = i; int clusterid = MoveInstanceToBestCluster(data.instance(i)); if (clusterid != m_clusterAssignments.get(i)) { moved = true; m_clusterAssignments.set(i, clusterid); } } } while (!moved); m_processed_InstanceID = 0; } /** * the default constructor */ public CLOPE() { super(); } /** * Add instance to best cluster */ public int AddInstanceToBestCluster(Instance inst) { double delta; double deltamax; int clustermax = -1; if (clusters.size() > 0) { int tempS = 0; int tempW = 0; if (inst instanceof SparseInstance) { for (int i = 0; i < inst.numValues(); i++) { tempS++; tempW++; } } else { for (int i = 0; i < inst.numAttributes(); i++) { if (!inst.isMissing(i)) { tempS++; tempW++; } } } deltamax = tempS / Math.pow(tempW, m_Repulsion); for (int i = 0; i < clusters.size(); i++) { CLOPECluster tempcluster = clusters.get(i); delta = tempcluster.DeltaAdd(inst, m_Repulsion); // System.out.println("delta " + delta); if (delta > deltamax) { deltamax = delta; clustermax = i; } } } else { CLOPECluster newcluster = new CLOPECluster(); clusters.add(newcluster); newcluster.AddInstance(inst); return clusters.size() - 1; } if (clustermax == -1) { CLOPECluster newcluster = new CLOPECluster(); clusters.add(newcluster); newcluster.AddInstance(inst); return clusters.size() - 1; } clusters.get(clustermax).AddInstance(inst); return clustermax; } /** * Move instance to best cluster */ public int MoveInstanceToBestCluster(Instance inst) { clusters.get(m_clusterAssignments.get(m_processed_InstanceID)).DeleteInstance(inst); m_clusterAssignments.set(m_processed_InstanceID, -1); double delta; double deltamax; int clustermax = -1; int tempS = 0; int tempW = 0; if (inst instanceof SparseInstance) { for (int i = 0; i < inst.numValues(); i++) { tempS++; tempW++; } } else { for (int i = 0; i < inst.numAttributes(); i++) { if (!inst.isMissing(i)) { tempS++; tempW++; } } } deltamax = tempS / Math.pow(tempW, m_Repulsion); for (int i = 0; i < clusters.size(); i++) { CLOPECluster tempcluster = clusters.get(i); delta = tempcluster.DeltaAdd(inst, m_Repulsion); // System.out.println("delta " + delta); if (delta > deltamax) { deltamax = delta; clustermax = i; } } if (clustermax == -1) { CLOPECluster newcluster = new CLOPECluster(); clusters.add(newcluster); newcluster.AddInstance(inst); return clusters.size() - 1; } clusters.get(clustermax).AddInstance(inst); return clustermax; } /** * Classifies a given instance. * * @param instance The instance to be assigned to a cluster * @return int The number of the assigned cluster as an integer * @throws java.lang.Exception If instance could not be clustered * successfully */ public int clusterInstance(Instance instance) throws Exception { if (m_processed_InstanceID >= m_numberOfInstances) { m_processed_InstanceID = 0; } int i = m_clusterAssignments.get(m_processed_InstanceID); m_processed_InstanceID++; return i; } /** * return a string describing this clusterer * * @return a description of the clusterer as a string */ public String toString() { StringBuffer stringBuffer = new StringBuffer(); stringBuffer.append("CLOPE clustering results\n" + "========================================================================================\n\n"); stringBuffer.append("Clustered instances: " + m_clusterAssignments.size() + "\n"); return stringBuffer.toString() + "\n"; } /** * Returns a string describing this DataMining-Algorithm * @return String Information for the gui-explorer */ public String globalInfo() { return getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Yiling Yang and Xudong Guan and Jinyuan You"); result.setValue(Field.TITLE, "CLOPE: a fast and effective clustering algorithm for transactional data"); result.setValue(Field.BOOKTITLE, "Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining"); result.setValue(Field.YEAR, "2002"); result.setValue(Field.PAGES, "682-687"); result.setValue(Field.PUBLISHER, "ACM New York, NY, USA"); return result; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 5538 $"); } /** * Main method for testing this class. * * @param argv should contain the following arguments: <p> * -t training file [-R repulsion] */ public static void main(String[] argv) { runClusterer(new CLOPE(), argv); } }