/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * CheckClusterer.java * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand * */ package weka.clusterers; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.CheckScheme; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.MultiInstanceCapabilitiesHandler; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.SerializationHelper; import weka.core.TestInstances; import weka.core.Utils; import weka.core.WeightedInstancesHandler; /** * Class for examining the capabilities and finding problems with * clusterers. If you implement a clusterer using the WEKA.libraries, * you should run the checks on it to ensure robustness and correct * operation. Passing all the tests of this object does not mean * bugs in the clusterer don't exist, but this will help find some * common ones. <p/> * * Typical usage: <p/> * <code>java weka.clusterers.CheckClusterer -W clusterer_name * -- clusterer_options </code><p/> * * CheckClusterer reports on the following: * <ul> * <li> Clusterer abilities * <ul> * <li> Possible command line options to the clusterer </li> * <li> Whether the clusterer can predict nominal, numeric, string, * date or relational class attributes.</li> * <li> Whether the clusterer can handle numeric predictor attributes </li> * <li> Whether the clusterer can handle nominal predictor attributes </li> * <li> Whether the clusterer can handle string predictor attributes </li> * <li> Whether the clusterer can handle date predictor attributes </li> * <li> Whether the clusterer can handle relational predictor attributes </li> * <li> Whether the clusterer can handle multi-instance data </li> * <li> Whether the clusterer can handle missing predictor values </li> * <li> Whether the clusterer can handle instance weights </li> * </ul> * </li> * <li> Correct functioning * <ul> * <li> Correct initialisation during buildClusterer (i.e. no result * changes when buildClusterer called repeatedly) </li> * <li> Whether the clusterer alters the data pased to it * (number of instances, instance order, instance weights, etc) </li> * </ul> * </li> * <li> Degenerate cases * <ul> * <li> building clusterer with zero training instances </li> * <li> all but one predictor attribute values missing </li> * <li> all predictor attribute values missing </li> * <li> all but one class values missing </li> * <li> all class values missing </li> * </ul> * </li> * </ul> * Running CheckClusterer with the debug option set will output the * training dataset for any failed tests.<p/> * * The <code>weka.clusterers.AbstractClustererTest</code> uses this * class to test all the clusterers. Any changes here, have to be * checked in that abstract test class, too. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -D * Turn on debugging output.</pre> * * <pre> -S * Silent mode - prints nothing to stdout.</pre> * * <pre> -N <num> * The number of instances in the datasets (default 20).</pre> * * <pre> -nominal <num> * The number of nominal attributes (default 2).</pre> * * <pre> -nominal-values <num> * The number of values for nominal attributes (default 1).</pre> * * <pre> -numeric <num> * The number of numeric attributes (default 1).</pre> * * <pre> -string <num> * The number of string attributes (default 1).</pre> * * <pre> -date <num> * The number of date attributes (default 1).</pre> * * <pre> -relational <num> * The number of relational attributes (default 1).</pre> * * <pre> -num-instances-relational <num> * The number of instances in relational/bag attributes (default 10).</pre> * * <pre> -words <comma-separated-list> * The words to use in string attributes.</pre> * * <pre> -word-separators <chars> * The word separators to use in string attributes.</pre> * * <pre> -W * Full name of the clusterer analyzed. * eg: weka.clusterers.SimpleKMeans * (default weka.clusterers.SimpleKMeans)</pre> * * <pre> * Options specific to clusterer weka.clusterers.SimpleKMeans: * </pre> * * <pre> -N <num> * number of clusters. * (default 2).</pre> * * <pre> -V * Display std. deviations for centroids. * </pre> * * <pre> -M * Replace missing values with mean/mode. * </pre> * * <pre> -S <num> * Random number seed. * (default 10)</pre> * <!-- options-end --> * * Options after -- are passed to the designated clusterer.<p/> * * @author Len Trigg (trigg@cs.waikato.ac.nz) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 8034 $ * @see TestInstances */ public class CheckClusterer extends CheckScheme { /* * Note about test methods: * - methods return array of booleans * - first index: success or not * - second index: acceptable or not (e.g., Exception is OK) * * FracPete (fracpete at waikato dot ac dot nz) */ /*** The clusterer to be examined */ protected Clusterer m_Clusterer = new SimpleKMeans(); /** * default constructor */ public CheckClusterer() { super(); setNumInstances(40); } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector result = new Vector(); Enumeration en = super.listOptions(); while (en.hasMoreElements()) result.addElement(en.nextElement()); result.addElement(new Option( "\tFull name of the clusterer analyzed.\n" +"\teg: weka.clusterers.SimpleKMeans\n" + "\t(default weka.clusterers.SimpleKMeans)", "W", 1, "-W")); if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler)) { result.addElement(new Option("", "", 0, "\nOptions specific to clusterer " + m_Clusterer.getClass().getName() + ":")); Enumeration enu = ((OptionHandler)m_Clusterer).listOptions(); while (enu.hasMoreElements()) result.addElement(enu.nextElement()); } return result.elements(); } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -D * Turn on debugging output.</pre> * * <pre> -S * Silent mode - prints nothing to stdout.</pre> * * <pre> -N <num> * The number of instances in the datasets (default 20).</pre> * * <pre> -nominal <num> * The number of nominal attributes (default 2).</pre> * * <pre> -nominal-values <num> * The number of values for nominal attributes (default 1).</pre> * * <pre> -numeric <num> * The number of numeric attributes (default 1).</pre> * * <pre> -string <num> * The number of string attributes (default 1).</pre> * * <pre> -date <num> * The number of date attributes (default 1).</pre> * * <pre> -relational <num> * The number of relational attributes (default 1).</pre> * * <pre> -num-instances-relational <num> * The number of instances in relational/bag attributes (default 10).</pre> * * <pre> -words <comma-separated-list> * The words to use in string attributes.</pre> * * <pre> -word-separators <chars> * The word separators to use in string attributes.</pre> * * <pre> -W * Full name of the clusterer analyzed. * eg: weka.clusterers.SimpleKMeans * (default weka.clusterers.SimpleKMeans)</pre> * * <pre> * Options specific to clusterer weka.clusterers.SimpleKMeans: * </pre> * * <pre> -N <num> * number of clusters. * (default 2).</pre> * * <pre> -V * Display std. deviations for centroids. * </pre> * * <pre> -M * Replace missing values with mean/mode. * </pre> * * <pre> -S <num> * Random number seed. * (default 10)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; tmpStr = Utils.getOption('N', options); super.setOptions(options); if (tmpStr.length() != 0) setNumInstances(Integer.parseInt(tmpStr)); else setNumInstances(40); tmpStr = Utils.getOption('W', options); if (tmpStr.length() == 0) tmpStr = weka.clusterers.SimpleKMeans.class.getName(); setClusterer( (Clusterer) forName( "weka.clusterers", Clusterer.class, tmpStr, Utils.partitionOptions(options))); } /** * Gets the current settings of the CheckClusterer. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; String[] options; int i; result = new Vector(); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); if (getClusterer() != null) { result.add("-W"); result.add(getClusterer().getClass().getName()); } if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler)) options = ((OptionHandler) m_Clusterer).getOptions(); else options = new String[0]; if (options.length > 0) { result.add("--"); for (i = 0; i < options.length; i++) result.add(options[i]); } return (String[]) result.toArray(new String[result.size()]); } /** * Begin the tests, reporting results to System.out */ public void doTests() { if (getClusterer() == null) { println("\n=== No clusterer set ==="); return; } println("\n=== Check on Clusterer: " + getClusterer().getClass().getName() + " ===\n"); // Start tests println("--> Checking for interfaces"); canTakeOptions(); boolean updateable = updateableClusterer()[0]; boolean weightedInstancesHandler = weightedInstancesHandler()[0]; boolean multiInstanceHandler = multiInstanceHandler()[0]; println("--> Clusterer tests"); declaresSerialVersionUID(); runTests(weightedInstancesHandler, multiInstanceHandler, updateable); } /** * Set the clusterer for testing. * * @param newClusterer the Clusterer to use. */ public void setClusterer(Clusterer newClusterer) { m_Clusterer = newClusterer; } /** * Get the clusterer used as the clusterer * * @return the clusterer used as the clusterer */ public Clusterer getClusterer() { return m_Clusterer; } /** * Run a battery of tests * * @param weighted true if the clusterer says it handles weights * @param multiInstance true if the clusterer is a multi-instance clusterer * @param updateable true if the classifier is updateable */ protected void runTests(boolean weighted, boolean multiInstance, boolean updateable) { boolean PNom = canPredict(true, false, false, false, false, multiInstance)[0]; boolean PNum = canPredict(false, true, false, false, false, multiInstance)[0]; boolean PStr = canPredict(false, false, true, false, false, multiInstance)[0]; boolean PDat = canPredict(false, false, false, true, false, multiInstance)[0]; boolean PRel; if (!multiInstance) PRel = canPredict(false, false, false, false, true, multiInstance)[0]; else PRel = false; if (PNom || PNum || PStr || PDat || PRel) { if (weighted) instanceWeights(PNom, PNum, PStr, PDat, PRel, multiInstance); canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel, multiInstance); boolean handleMissingPredictors = canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 20)[0]; if (handleMissingPredictors) canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 100); correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel, multiInstance); datasetIntegrity(PNom, PNum, PStr, PDat, PRel, multiInstance, handleMissingPredictors); if (updateable) updatingEquality(PNom, PNum, PStr, PDat, PRel, multiInstance); } } /** * Checks whether the scheme can take command line options. * * @return index 0 is true if the clusterer can take options */ protected boolean[] canTakeOptions() { boolean[] result = new boolean[2]; print("options..."); if (m_Clusterer instanceof OptionHandler) { println("yes"); if (m_Debug) { println("\n=== Full report ==="); Enumeration enu = ((OptionHandler)m_Clusterer).listOptions(); while (enu.hasMoreElements()) { Option option = (Option) enu.nextElement(); print(option.synopsis() + "\n" + option.description() + "\n"); } println("\n"); } result[0] = true; } else { println("no"); result[0] = false; } return result; } /** * Checks whether the scheme can build models incrementally. * * @return index 0 is true if the clusterer can train incrementally */ protected boolean[] updateableClusterer() { boolean[] result = new boolean[2]; print("updateable clusterer..."); if (m_Clusterer instanceof UpdateableClusterer) { println("yes"); result[0] = true; } else { println("no"); result[0] = false; } return result; } /** * Checks whether the scheme says it can handle instance weights. * * @return true if the clusterer handles instance weights */ protected boolean[] weightedInstancesHandler() { boolean[] result = new boolean[2]; print("weighted instances clusterer..."); if (m_Clusterer instanceof WeightedInstancesHandler) { println("yes"); result[0] = true; } else { println("no"); result[0] = false; } return result; } /** * Checks whether the scheme handles multi-instance data. * * @return true if the clusterer handles multi-instance data */ protected boolean[] multiInstanceHandler() { boolean[] result = new boolean[2]; print("multi-instance clusterer..."); if (m_Clusterer instanceof MultiInstanceCapabilitiesHandler) { println("yes"); result[0] = true; } else { println("no"); result[0] = false; } return result; } /** * tests for a serialVersionUID. Fails in case the scheme doesn't declare * a UID. * * @return index 0 is true if the scheme declares a UID */ protected boolean[] declaresSerialVersionUID() { boolean[] result = new boolean[2]; print("serialVersionUID..."); result[0] = !SerializationHelper.needsUID(m_Clusterer.getClass()); if (result[0]) println("yes"); else println("no"); return result; } /** * Checks basic prediction of the scheme, for simple non-troublesome * datasets. * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @return index 0 is true if the test was passed, index 1 is true if test * was acceptable */ protected boolean[] canPredict( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) { print("basic predict"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); FastVector accepts = new FastVector(); accepts.addElement("unary"); accepts.addElement("binary"); accepts.addElement("nominal"); accepts.addElement("numeric"); accepts.addElement("string"); accepts.addElement("date"); accepts.addElement("relational"); accepts.addElement("multi-instance"); accepts.addElement("not in classpath"); int numTrain = getNumInstances(), missingLevel = 0; boolean predictorMissing = false; return runBasicTest(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, missingLevel, predictorMissing, numTrain, accepts); } /** * Checks whether the scheme can handle zero training instances. * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @return index 0 is true if the test was passed, index 1 is true if test * was acceptable */ protected boolean[] canHandleZeroTraining( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) { print("handle zero training instances"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); FastVector accepts = new FastVector(); accepts.addElement("train"); accepts.addElement("value"); int numTrain = 0, missingLevel = 0; boolean predictorMissing = false; return runBasicTest( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, missingLevel, predictorMissing, numTrain, accepts); } /** * Checks whether the scheme correctly initialises models when * buildClusterer is called. This test calls buildClusterer with * one training dataset. buildClusterer is then called on a training set * with different structure, and then again with the original training set. * If the equals method of the ClusterEvaluation class returns * false, this is noted as incorrect build initialisation. * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @return index 0 is true if the test was passed */ protected boolean[] correctBuildInitialisation( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) { boolean[] result = new boolean[2]; print("correct initialisation during buildClusterer"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); int numTrain = getNumInstances(), missingLevel = 0; boolean predictorMissing = false; Instances train1 = null; Instances train2 = null; Clusterer clusterer = null; ClusterEvaluation evaluation1A = null; ClusterEvaluation evaluation1B = null; ClusterEvaluation evaluation2 = null; boolean built = false; int stage = 0; try { // Make two train sets with different numbers of attributes train1 = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance); train2 = makeTestDataset(84, numTrain, nominalPredictor ? getNumNominal() + 1 : 0, numericPredictor ? getNumNumeric() + 1 : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance); if (nominalPredictor && !multiInstance) { train1.deleteAttributeAt(0); train2.deleteAttributeAt(0); } if (missingLevel > 0) { addMissing(train1, missingLevel, predictorMissing); addMissing(train2, missingLevel, predictorMissing); } clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0]; evaluation1A = new ClusterEvaluation(); evaluation1B = new ClusterEvaluation(); evaluation2 = new ClusterEvaluation(); } catch (Exception ex) { throw new Error("Error setting up for tests: " + ex.getMessage()); } try { stage = 0; clusterer.buildClusterer(train1); built = true; evaluation1A.setClusterer(clusterer); evaluation1A.evaluateClusterer(train1); stage = 1; built = false; clusterer.buildClusterer(train2); built = true; evaluation2.setClusterer(clusterer); evaluation2.evaluateClusterer(train2); stage = 2; built = false; clusterer.buildClusterer(train1); built = true; evaluation1B.setClusterer(clusterer); evaluation1B.evaluateClusterer(train1); stage = 3; if (!evaluation1A.equals(evaluation1B)) { if (m_Debug) { println("\n=== Full report ===\n"); println("First buildClusterer()"); println(evaluation1A.clusterResultsToString() + "\n\n"); println("Second buildClusterer()"); println(evaluation1B.clusterResultsToString() + "\n\n"); } throw new Exception("Results differ between buildClusterer calls"); } println("yes"); result[0] = true; if (false && m_Debug) { println("\n=== Full report ===\n"); println("First buildClusterer()"); println(evaluation1A.clusterResultsToString() + "\n\n"); println("Second buildClusterer()"); println(evaluation1B.clusterResultsToString() + "\n\n"); } } catch (Exception ex) { println("no"); result[0] = false; if (m_Debug) { println("\n=== Full Report ==="); print("Problem during"); if (built) { print(" testing"); } else { print(" training"); } switch (stage) { case 0: print(" of dataset 1"); break; case 1: print(" of dataset 2"); break; case 2: print(" of dataset 1 (2nd build)"); break; case 3: print(", comparing results from builds of dataset 1"); break; } println(": " + ex.getMessage() + "\n"); println("here are the datasets:\n"); println("=== Train1 Dataset ===\n" + train1.toString() + "\n"); println("=== Train2 Dataset ===\n" + train2.toString() + "\n"); } } return result; } /** * Checks basic missing value handling of the scheme. If the missing * values cause an exception to be thrown by the scheme, this will be * recorded. * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @param predictorMissing true if the missing values may be in * the predictors * @param missingLevel the percentage of missing values * @return index 0 is true if the test was passed, index 1 is true if test * was acceptable */ protected boolean[] canHandleMissing( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance, boolean predictorMissing, int missingLevel) { if (missingLevel == 100) print("100% "); print("missing"); if (predictorMissing) { print(" predictor"); } print(" values"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); FastVector accepts = new FastVector(); accepts.addElement("missing"); accepts.addElement("value"); accepts.addElement("train"); int numTrain = getNumInstances(); return runBasicTest(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, missingLevel, predictorMissing, numTrain, accepts); } /** * Checks whether the clusterer can handle instance weights. * This test compares the clusterer performance on two datasets * that are identical except for the training weights. If the * results change, then the clusterer must be using the weights. It * may be possible to get a false positive from this test if the * weight changes aren't significant enough to induce a change * in clusterer performance (but the weights are chosen to minimize * the likelihood of this). * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @return index 0 true if the test was passed */ protected boolean[] instanceWeights( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) { print("clusterer uses instance weights"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); int numTrain = 2*getNumInstances(), missingLevel = 0; boolean predictorMissing = false; boolean[] result = new boolean[2]; Instances train = null; Clusterer [] clusterers = null; ClusterEvaluation evaluationB = null; ClusterEvaluation evaluationI = null; boolean built = false; boolean evalFail = false; try { train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() + 1 : 0, numericPredictor ? getNumNumeric() + 1 : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance); if (nominalPredictor && !multiInstance) train.deleteAttributeAt(0); if (missingLevel > 0) addMissing(train, missingLevel, predictorMissing); clusterers = AbstractClusterer.makeCopies(getClusterer(), 2); evaluationB = new ClusterEvaluation(); evaluationI = new ClusterEvaluation(); clusterers[0].buildClusterer(train); evaluationB.setClusterer(clusterers[0]); } catch (Exception ex) { throw new Error("Error setting up for tests: " + ex.getMessage()); } try { // Now modify instance weights and re-built/test for (int i = 0; i < train.numInstances(); i++) { train.instance(i).setWeight(0); } Random random = new Random(1); for (int i = 0; i < train.numInstances() / 2; i++) { int inst = Math.abs(random.nextInt()) % train.numInstances(); int weight = Math.abs(random.nextInt()) % 10 + 1; train.instance(inst).setWeight(weight); } clusterers[1].buildClusterer(train); built = true; evaluationI.setClusterer(clusterers[1]); if (evaluationB.equals(evaluationI)) { // println("no"); evalFail = true; throw new Exception("evalFail"); } println("yes"); result[0] = true; } catch (Exception ex) { println("no"); result[0] = false; if (m_Debug) { println("\n=== Full Report ==="); if (evalFail) { println("Results don't differ between non-weighted and " + "weighted instance models."); println("Here are the results:\n"); println("\nboth methods\n"); println(evaluationB.clusterResultsToString()); } else { print("Problem during"); if (built) { print(" testing"); } else { print(" training"); } println(": " + ex.getMessage() + "\n"); } println("Here is the dataset:\n"); println("=== Train Dataset ===\n" + train.toString() + "\n"); println("=== Train Weights ===\n"); for (int i = 0; i < train.numInstances(); i++) { println(" " + (i + 1) + " " + train.instance(i).weight()); } } } return result; } /** * Checks whether the scheme alters the training dataset during * training. If the scheme needs to modify the training * data it should take a copy of the training data. Currently checks * for changes to header structure, number of instances, order of * instances, instance weights. * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @param predictorMissing true if we know the clusterer can handle * (at least) moderate missing predictor values * @return index 0 is true if the test was passed */ protected boolean[] datasetIntegrity( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance, boolean predictorMissing) { print("clusterer doesn't alter original datasets"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); int numTrain = getNumInstances(), missingLevel = 20; boolean[] result = new boolean[2]; Instances train = null; Clusterer clusterer = null; try { train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance); if (nominalPredictor && !multiInstance) train.deleteAttributeAt(0); if (missingLevel > 0) addMissing(train, missingLevel, predictorMissing); clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0]; } catch (Exception ex) { throw new Error("Error setting up for tests: " + ex.getMessage()); } try { Instances trainCopy = new Instances(train); clusterer.buildClusterer(trainCopy); compareDatasets(train, trainCopy); println("yes"); result[0] = true; } catch (Exception ex) { println("no"); result[0] = false; if (m_Debug) { println("\n=== Full Report ==="); print("Problem during training"); println(": " + ex.getMessage() + "\n"); println("Here is the dataset:\n"); println("=== Train Dataset ===\n" + train.toString() + "\n"); } } return result; } /** * Checks whether an updateable scheme produces the same model when * trained incrementally as when batch trained. The model itself * cannot be compared, so we compare the evaluation on test data * for both models. It is possible to get a false positive on this * test (likelihood depends on the classifier). * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @return index 0 is true if the test was passed */ protected boolean[] updatingEquality( boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) { print("incremental training produces the same results" + " as batch training"); printAttributeSummary( nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance); print("..."); int numTrain = getNumInstances(), missingLevel = 0; boolean predictorMissing = false, classMissing = false; boolean[] result = new boolean[2]; Instances train = null; Clusterer[] clusterers = null; ClusterEvaluation evaluationB = null; ClusterEvaluation evaluationI = null; boolean built = false; try { train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance); if (missingLevel > 0) addMissing(train, missingLevel, predictorMissing, classMissing); clusterers = AbstractClusterer.makeCopies(getClusterer(), 2); evaluationB = new ClusterEvaluation(); evaluationI = new ClusterEvaluation(); clusterers[0].buildClusterer(train); evaluationB.setClusterer(clusterers[0]); } catch (Exception ex) { throw new Error("Error setting up for tests: " + ex.getMessage()); } try { clusterers[1].buildClusterer(new Instances(train, 0)); for (int i = 0; i < train.numInstances(); i++) { ((UpdateableClusterer)clusterers[1]).updateClusterer( train.instance(i)); } built = true; evaluationI.setClusterer(clusterers[1]); if (!evaluationB.equals(evaluationI)) { println("no"); result[0] = false; if (m_Debug) { println("\n=== Full Report ==="); println("Results differ between batch and " + "incrementally built models.\n" + "Depending on the classifier, this may be OK"); println("Here are the results:\n"); println("\nbatch built results\n" + evaluationB.clusterResultsToString()); println("\nincrementally built results\n" + evaluationI.clusterResultsToString()); println("Here are the datasets:\n"); println("=== Train Dataset ===\n" + train.toString() + "\n"); } } else { println("yes"); result[0] = true; } } catch (Exception ex) { result[0] = false; print("Problem during"); if (built) print(" testing"); else print(" training"); println(": " + ex.getMessage() + "\n"); } return result; } /** * Runs a text on the datasets with the given characteristics. * * @param nominalPredictor if true use nominal predictor attributes * @param numericPredictor if true use numeric predictor attributes * @param stringPredictor if true use string predictor attributes * @param datePredictor if true use date predictor attributes * @param relationalPredictor if true use relational predictor attributes * @param multiInstance whether multi-instance is needed * @param missingLevel the percentage of missing values * @param predictorMissing true if the missing values may be in * the predictors * @param numTrain the number of instances in the training set * @param accepts the acceptable string in an exception * @return index 0 is true if the test was passed, index 1 is true if test * was acceptable */ protected boolean[] runBasicTest(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance, int missingLevel, boolean predictorMissing, int numTrain, FastVector accepts) { boolean[] result = new boolean[2]; Instances train = null; Clusterer clusterer = null; try { train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance); if (nominalPredictor && !multiInstance) train.deleteAttributeAt(0); if (missingLevel > 0) addMissing(train, missingLevel, predictorMissing); clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0]; } catch (Exception ex) { ex.printStackTrace(); throw new Error("Error setting up for tests: " + ex.getMessage()); } try { clusterer.buildClusterer(train); println("yes"); result[0] = true; } catch (Exception ex) { boolean acceptable = false; String msg = ex.getMessage().toLowerCase(); for (int i = 0; i < accepts.size(); i++) { if (msg.indexOf((String)accepts.elementAt(i)) >= 0) { acceptable = true; } } println("no" + (acceptable ? " (OK error message)" : "")); result[1] = acceptable; if (m_Debug) { println("\n=== Full Report ==="); print("Problem during training"); println(": " + ex.getMessage() + "\n"); if (!acceptable) { if (accepts.size() > 0) { print("Error message doesn't mention "); for (int i = 0; i < accepts.size(); i++) { if (i != 0) { print(" or "); } print('"' + (String)accepts.elementAt(i) + '"'); } } println("here is the dataset:\n"); println("=== Train Dataset ===\n" + train.toString() + "\n"); } } } return result; } /** * Add missing values to a dataset. * * @param data the instances to add missing values to * @param level the level of missing values to add (if positive, this * is the probability that a value will be set to missing, if negative * all but one value will be set to missing (not yet implemented)) * @param predictorMissing if true, predictor attributes will be modified */ protected void addMissing(Instances data, int level, boolean predictorMissing) { Random random = new Random(1); for (int i = 0; i < data.numInstances(); i++) { Instance current = data.instance(i); for (int j = 0; j < data.numAttributes(); j++) { if (predictorMissing) { if (Math.abs(random.nextInt()) % 100 < level) current.setMissing(j); } } } } /** * Make a simple set of instances with variable position of the class * attribute, which can later be modified for use in specific tests. * * @param seed the random number seed * @param numInstances the number of instances to generate * @param numNominal the number of nominal attributes * @param numNumeric the number of numeric attributes * @param numString the number of string attributes * @param numDate the number of date attributes * @param numRelational the number of relational attributes * @param multiInstance whether the dataset should a multi-instance dataset * @return the test dataset * @throws Exception if the dataset couldn't be generated * @see TestInstances#CLASS_IS_LAST */ protected Instances makeTestDataset(int seed, int numInstances, int numNominal, int numNumeric, int numString, int numDate, int numRelational, boolean multiInstance) throws Exception { TestInstances dataset = new TestInstances(); dataset.setSeed(seed); dataset.setNumInstances(numInstances); dataset.setNumNominal(numNominal); dataset.setNumNumeric(numNumeric); dataset.setNumString(numString); dataset.setNumDate(numDate); dataset.setNumRelational(numRelational); dataset.setClassIndex(TestInstances.NO_CLASS); dataset.setMultiInstance(multiInstance); return dataset.generate(); } /** * Print out a short summary string for the dataset characteristics * * @param nominalPredictor true if nominal predictor attributes are present * @param numericPredictor true if numeric predictor attributes are present * @param stringPredictor true if string predictor attributes are present * @param datePredictor true if date predictor attributes are present * @param relationalPredictor true if relational predictor attributes are present * @param multiInstance whether multi-instance is needed */ protected void printAttributeSummary(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) { String str = ""; if (numericPredictor) str += "numeric"; if (nominalPredictor) { if (str.length() > 0) str += " & "; str += "nominal"; } if (stringPredictor) { if (str.length() > 0) str += " & "; str += "string"; } if (datePredictor) { if (str.length() > 0) str += " & "; str += "date"; } if (relationalPredictor) { if (str.length() > 0) str += " & "; str += "relational"; } str = " (" + str + " predictors)"; print(str); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } /** * Test method for this class * * @param args the commandline options */ public static void main(String [] args) { runCheck(new CheckClusterer(), args); } }