/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ClusterEvaluation.java
* Copyright (C) 1999 Mark Hall
*
*/
package weka.clusterers;
import java.util.*;
import java.io.*;
import weka.core.*;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
/**
* Class for evaluating clustering models.<p>
*
* Valid options are: <p>
*
* -t <name of the training file> <br>
* Specify the training file. <p>
*
* -T <name of the test file> <br>
* Specify the test file to apply clusterer to. <p>
*
* -d <name of file to save clustering model to> <br>
* Specify output file. <p>
*
* -l <name of file to load clustering model from> <br>
* Specify input file. <p>
*
* -p <attribute range> <br>
* Output predictions. Predictions are for the training file if only the
* training file is specified, otherwise they are for the test file. The range
* specifies attribute values to be output with the predictions.
* Use '-p 0' for none. <p>
*
* -x <num folds> <br>
* Set the number of folds for a cross validation of the training data.
* Cross validation can only be done for distribution clusterers and will
* be performed if the test file is missing. <p>
*
* -c <class> <br>
* Set the class attribute. If set, then class based evaluation of clustering
* is performed. <p>
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision: 1.1.1.1 $
*/
public class ClusterEvaluation {
/** The data set being evaluated; replaced by {@code setInstances}. */
private Instances instances;
/** The clusterer under evaluation; replaced by {@code setClusterer}. */
private Clusterer clusterer;
/** Whether to do cross validation (DistributionClusterers only). Stored by
{@code setDoXval}; no method of this class reads it. */
private boolean m_doXval;
/** The number of folds to use for cross validation. Stored by
{@code setFolds}; no method of this class reads it. */
private int m_numFolds;
/** Seed to use for cross validation. Stored by {@code setSeed}; no method of
this class reads it. */
private int m_seed;
/** Accumulates the textual report; the evaluation methods only ever append to it. */
private StringBuffer results;
/** Number of instances in {@code instances}, cached by {@code setInstances}. */
private int numInstances;
/** Number of clusters, copied from the clusterer's {@code K} field by
{@code setClusterer}. */
private int numClusters;
/** Number of values of the class attribute; {@code setInstances} refreshes it
only when the data has a class index set. */
private int numClasses;
/** Holds the assignments of instances to clusters for the data most recently
evaluated, one entry per instance. The cluster index is stored as a double;
{@code evaluateClusterer()} treats -1 as "unclustered". */
private double [] assignments;
/** Minimum-error class index for each cluster (indexed by cluster number, -1
when no class was mapped to the cluster). Filled in by the class based
evaluation; null until then. */
private int [] m_classToCluster = null;
/**
 * Sets the clusterer to evaluate. The cluster count is taken from the
 * clusterer's {@code K} field and the clusterer's own {@code instances}
 * become the data to evaluate.
 *
 * @param clusterer the clusterer to use
 */
public void setClusterer(Clusterer clusterer) {
  this.clusterer = clusterer;
  this.numClusters = clusterer.K;
  Instances clustererData = clusterer.instances;
  setInstances(clustererData);
}
/**
 * Sets the data to evaluate and refreshes the cached instance count.
 * <p>
 * A {@code null} data set is accepted (the instance count becomes 0) so that
 * an evaluation can be constructed around a clusterer whose data is not
 * available yet and be given its data later, e.g. through
 * {@code evaluateClusterer(Instances)}.
 * <p>
 * The cached number of classes is only refreshed when the data has a class
 * index set; otherwise the previous value is kept, exactly as before.
 *
 * @param instances the data to evaluate, may be {@code null}
 */
public void setInstances(Instances instances) {
  this.instances = instances;
  if (instances == null) {
    // Nothing to count yet; avoid the NullPointerException the unguarded
    // calls below would raise.
    numInstances = 0;
    return;
  }
  numInstances = instances.numInstances();
  if (instances.classIndex() > -1) {
    numClasses = instances.classAttribute().numValues();
  }
}
/**
 * Records whether cross validation should be performed.
 *
 * @param x true if cross validation is to be done
 */
public void setDoXval(boolean x) {
  this.m_doXval = x;
}
/**
 * Records the number of folds to use for cross validation.
 *
 * @param folds the number of folds
 */
public void setFolds(int folds) {
  this.m_numFolds = folds;
}
/**
 * Records the random seed to use for cross validation.
 *
 * @param s the seed
 */
public void setSeed(int s) {
  this.m_seed = s;
}
/**
 * Returns the report accumulated so far by the evaluation methods.
 *
 * @return a string detailing the results of clustering a data set
 */
public String clusterResultsToString() {
  String report = this.results.toString();
  return report;
}
/**
 * Returns the number of clusters, as copied from the clusterer when it was
 * set on this evaluation.
 *
 * @return the number of clusters
 */
public int getNumClusters() {
  return this.numClusters;
}
/**
 * Returns the cluster assignments of the most recently evaluated instances.
 * The internal array itself is returned, not a copy.
 *
 * @return an array of cluster assignments, or null before any evaluation
 */
public double [] getClusterAssignments() {
  return this.assignments;
}
/**
 * Returns the minimum error class to cluster mapping, ordered by cluster
 * number. The internal array itself is returned, not a copy.
 *
 * @return an array of class to cluster mappings, or null if no class based
 *         evaluation has been run
 */
public int [] getClassesToClusters() {
  return this.m_classToCluster;
}
/**
 * Default constructor. Delegates to the single-clusterer constructor with a
 * freshly created EM clusterer, so the default Clusterer is EM and every
 * other member receives that constructor's defaults.
 */
public ClusterEvaluation () {
this(new EM());
}
/**
 * Creates an evaluation for the given clusterer, using the clusterer's own
 * {@code instances} field as the data to evaluate.
 *
 * @param clusterer the clusterer to evaluate
 */
public ClusterEvaluation(Clusterer clusterer){
this(clusterer,clusterer.instances);
}
/**
 * Creates an evaluation of the given clusterer on the given data.
 * Defaults: 10 folds, no cross validation, seed 1, an empty report and no
 * cluster assignments.
 *
 * @param clusterer the clusterer to evaluate
 * @param instances the data to evaluate the clusterer on
 */
public ClusterEvaluation(Clusterer clusterer, Instances instances) {
  this.results = new StringBuffer();
  this.assignments = null;
  setSeed(1);
  setDoXval(false);
  setFolds(10);
  // The clusterer goes first: setClusterer also installs the clusterer's own
  // data, which the explicit data set below then replaces.
  setClusterer(clusterer);
  setInstances(instances);
}
/**
 * Evaluates the clusterer on a set of instances: makes them the current data,
 * asks the clusterer to cluster them, then computes the clustering statistics
 * and stores the cluster assignments.
 *
 * @param test the set of instances to cluster
 * @exception Exception if something goes wrong
 */
public void evaluateClusterer(Instances test) throws Exception {
  this.setInstances(test);
  this.clusterer.clusterInstances(test);
  this.evaluateClusterer();
}
/**
 * Computes clustering statistics for the current data from the assignments
 * the clusterer reports, and appends them to the results text: the
 * clusterer's description, the number and percentage of instances per
 * cluster, the number of unclustered instances (assignment -1) and, for a
 * DistributionClusterer, the average log likelihood. If the data has a class
 * index set, the class based evaluation is appended as well.
 *
 * @exception Exception if something goes wrong
 */
public void evaluateClusterer() throws Exception {
  int numInstFieldWidth = (int)((Math.log(numInstances)/Math.log(10))+1);
  double[] instanceStats = new double[numClusters];
  int unclusteredInstances = 0;

  assignments = clusterer.getAssignments();
  for (int i = 0; i < numInstances; i++) {
    if (assignments[i] == -1) {
      unclusteredInstances++;
    } else {
      instanceStats[(int)assignments[i]]++;
    }
  }
  double sum = Utils.sum(instanceStats);

  results.append(clusterer.toString());
  results.append(" Clustered Instances\n\n");
  int clustFieldWidth = (int)((Math.log(numClusters)/Math.log(10))+1);
  for (int i = 0; i < numClusters; i++) {
    results.append(Utils.doubleToString((double)i, clustFieldWidth, 0));
    results.append("\t");
    results.append(Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0));
    results.append(" (");
    results.append(Utils.doubleToString((instanceStats[i]/sum*100.0), 3, 0));
    results.append("%)\n");
  }
  results.append("\nUnclustered instances : "+unclusteredInstances);

  if (clusterer instanceof DistributionClusterer) {
    DistributionClusterer distributionClusterer = (DistributionClusterer)clusterer;
    // Must be a double: with an int accumulator the compound assignments
    // silently narrow back to int, truncating every partial sum and the
    // final average.
    double loglk = 0.0;
    for (int i = 0; i < numInstances; i++) {
      double density = distributionClusterer.densityForInstance(instances.instance(i));
      if (density > 0) {
        loglk += Math.log(density);
      }
    }
    // Average over the clustered instances; report 0 when there are none
    // instead of dividing by zero.
    loglk = (sum > 0) ? loglk / sum : 0.0;
    results.append("\nLog likelihood: "+Utils.doubleToString(loglk,1,5)+"\n");
  }

  if (instances.classIndex() != -1) {
    evaluateClustersWithRespectToClass();
  }
}
/**
 * Runs the class based evaluation of a clusterer on the clusterer's own
 * {@code instances}.
 *
 * @param clusterer the clusterer to evaluate
 * @return the evaluation report
 * @exception Exception if something goes wrong
 */
public static String evaluateClusterer(Clusterer clusterer) throws Exception {
  Instances clustererData = clusterer.instances;
  return evaluateClusterer(clusterer, clustererData);
}
/**
 * Runs the class based evaluation of a clusterer on the given data, using the
 * cluster assignments the clusterer already holds.
 *
 * @param clusterer the clusterer to evaluate
 * @param instances the data (including the class) to evaluate against
 * @return the evaluation report
 * @exception Exception if something goes wrong
 */
public static String evaluateClusterer(Clusterer clusterer, Instances instances)
    throws Exception {
  ClusterEvaluation evaluation = new ClusterEvaluation(clusterer, instances);
  evaluation.assignments = evaluation.clusterer.getAssignments();
  evaluation.evaluateClustersWithRespectToClass();
  return evaluation.results.toString();
}
/**
 * Evaluates the current cluster assignments with respect to the class labels
 * of the current data. The per-instance class values and the class names are
 * obtained from the data itself.
 *
 * @exception Exception if something goes wrong
 */
public void evaluateClustersWithRespectToClass() throws Exception {
  double[] classValues = instances.allClasses();
  String[] classNames = instances.classes();
  evaluateClustersWithRespectToClass(classValues, classNames);
}
/**
 * Evaluates the current cluster assignments against the given class labels
 * and appends the outcome to the results text: the class names, the cluster
 * by class contingency table, the minimum error class for each non-empty
 * cluster, the number and percentage of incorrectly clustered instances, and
 * a summary line with
 * <ul>
 * <li>E - cluster-size weighted entropy of the class distribution inside
 *     each cluster,</li>
 * <li>P - cluster-size weighted purity (largest class proportion),</li>
 * <li>F - class-size weighted best F-measure,</li>
 * <li>H1, H2 - entropy of the cluster and of the class distribution,</li>
 * <li>NMI - (H2 - E) / sqrt(H1 * H2).</li>
 * </ul>
 * Instances with a negative assignment (unclustered) are left out of the
 * contingency table. Empty clusters and classes contribute 0 to H1/H2. NMI is
 * still undefined (NaN or infinite) when H1 or H2 is 0.
 * The resulting class to cluster mapping is stored for
 * {@code getClassesToClusters}.
 *
 * @param allClasses the class value index of every instance, in data order
 * @param classes the names of the class values
 * @exception Exception if something goes wrong
 */
public void evaluateClustersWithRespectToClass(double[] allClasses, String[] classes)
    throws Exception {
  int [][] counts = new int [numClusters][numClasses];
  int [] clusterTotals = new int[numClusters];
  int [] classTotals = new int[numClasses];
  // One extra slot: index numClusters of best holds the error of the best mapping.
  double [] best = new double[numClusters+1];
  double [] current = new double[numClusters+1];

  for (int i = 0; i < numInstances; i++) {
    int clusterId = (int)assignments[i];
    if (clusterId < 0) {
      // Unclustered instance (-1): it belongs to no cluster, and using it
      // as an array index would throw.
      continue;
    }
    int classId = (int)allClasses[i];
    counts[clusterId][classId]++;
    clusterTotals[clusterId]++;
    classTotals[classId]++;
  }

  best[numClusters] = Double.MAX_VALUE;
  mapClasses(0, counts, clusterTotals, current, best, 0);

  results.append("\n\nClasses: ");
  for (int i = 0; i < numClasses; i++) {
    results.append(classes[i]).append("\t");
  }
  results.append("\nClasses to Clusters:\n");
  results.append(Utils.toString(counts,"Cluster","Class"));

  int Cwidth = 1 + (int)(Math.log(numClusters) / Math.log(10));
  // add the minimum error assignment
  for (int i = 0; i < numClusters; i++) {
    if (clusterTotals[i] > 0) {
      results.append("\nCluster ");
      results.append(Utils.toString((double)i,Cwidth));
      results.append(" <-- ");
      if (best[i] < 0) {
        results.append("No class\n");
      } else {
        results.append(classes[(int)best[i]]).append("\n");
      }
    }
  }
  results.append("\nIncorrectly clustered instances :\t");
  results.append(best[numClusters]);
  results.append("\t");
  results.append(Utils.doubleToString((best[numClusters] /
                                       numInstances *
                                       100.0), 8, 4));
  results.append(" %\n");

  // Per cluster/class precision, recall and F-measure, plus the derived
  // entropy, purity and NMI summary.
  double [][] p = new double[numClusters][numClasses];
  double [][] r = new double[numClusters][numClasses];
  double [][] f = new double[numClusters][numClasses];
  double [] p1 = new double[numClusters];
  double [] p2 = new double[numClasses];
  double [] Es = new double[numClusters];
  double [] Ps = new double[numClusters];
  double [] Fs = new double[numClasses];
  double E=0,P=0,F=0,H1=0,H2=0,NMI;
  for (int i = 0; i < numClusters; i++) {
    p1[i] = (double)clusterTotals[i]/numInstances;
    for (int j = 0; j < numClasses; j++) {
      if (counts[i][j] == 0) {
        p[i][j] = r[i][j] = 0;
        f[i][j] = 0;
      } else {
        p[i][j] = (double)counts[i][j]/clusterTotals[i];
        r[i][j] = (double)counts[i][j]/classTotals[j];
        f[i][j] = 2*p[i][j]*r[i][j]/(p[i][j]+r[i][j]);
        Es[i] += -p[i][j]*Math.log(p[i][j])/Math.log(2);
        if (p[i][j] > Ps[i]) Ps[i] = p[i][j];
        if (f[i][j] > Fs[j]) Fs[j] = f[i][j];
      }
    }
    E += p1[i]*Es[i];
    P += p1[i]*Ps[i];
    // An empty cluster contributes 0 to the entropy; evaluating
    // 0 * log(0) would turn H1 (and NMI) into NaN.
    if (p1[i] > 0) {
      H1 += -p1[i]*Math.log(p1[i])/Math.log(2);
    }
  }
  for (int j = 0; j < numClasses; j++) {
    p2[j] = (double)classTotals[j]/numInstances;
    F += p2[j]*Fs[j];
    // Same convention for a class without instances.
    if (p2[j] > 0) {
      H2 += -p2[j]*Math.log(p2[j])/Math.log(2);
    }
  }
  NMI = (H2-E)/Math.sqrt(H1*H2);
  results.append("E="+E+" P="+P+" F="+F+" H1="+H1+" H2="+H2+" NMI="+NMI+"\n");

  // copy the class assignments
  m_classToCluster = new int [numClusters];
  for (int i = 0; i < numClusters; i++) {
    m_classToCluster[i] = (int)best[i];
  }
}
/**
 * Finds the minimum error mapping of classes to clusters by recursively
 * trying every admissible assignment, one cluster per recursion level.
 * A class may be mapped to at most one cluster; a cluster may also be left
 * without a class, in which case all of its instances count as errors.
 *
 * @param lev the cluster being processed
 * @param counts the counts of classes in clusters
 * @param clusterTotals the total number of examples in each cluster
 * @param current the current path through the class to cluster assignment
 * tree (-1 marks a cluster without a class)
 * @param best the best assignment path seen; its last element holds the
 * error of that path
 * @param error accumulates the error for a particular path
 */
private void mapClasses(int lev, int [][] counts, int [] clusterTotals,
                        double [] current, double [] best, int error) {
  // Leaf: every cluster has been decided, keep the path if it beats the best so far.
  if (lev == numClusters) {
    if (error < best[numClusters]) {
      best[numClusters] = error;
      System.arraycopy(current, 0, best, 0, numClusters);
    }
    return;
  }

  final int clusterSize = clusterTotals[lev];
  final int nextLevel = lev + 1;

  // Both the "empty cluster" and the "no class" case start from -1.
  current[lev] = -1;

  if (clusterSize == 0) {
    // Empty cluster -- ignored, contributes no error.
    mapClasses(nextLevel, counts, clusterTotals, current, best, error);
    return;
  }

  // First option: no class for this cluster, all its members are errors.
  mapClasses(nextLevel, counts, clusterTotals, current, best,
             error + clusterSize);

  // Then every class that occurs in this cluster and is still unassigned.
  final int classCount = counts[0].length;
  for (int candidate = 0; candidate < classCount; candidate++) {
    if (counts[lev][candidate] <= 0) {
      continue;
    }
    boolean alreadyAssigned = false;
    for (int earlier = 0; earlier < lev && !alreadyAssigned; earlier++) {
      alreadyAssigned = ((int)current[earlier] == candidate);
    }
    if (alreadyAssigned) {
      continue;
    }
    current[lev] = candidate;
    mapClasses(nextLevel, counts, clusterTotals, current, best,
               (error + (clusterSize - counts[lev][candidate])));
  }
}
/**
 * Evaluates a clusterer with the options given in an array of
 * strings. It takes the string indicated by "-t" as training file, the
 * string indicated by "-T" as test file.
 * Using "-x" you request a cross-validation of the log likelihood on the
 * training data (distribution clusterers only, and only without a test or
 * model input file) and set its number of folds; "-s" sets the random seed.
 * If the "-p" option is present it outputs the cluster assignment for
 * each test instance. If you provide the name of an object file using
 * "-l", a clusterer will be loaded from the given file. If you provide the
 * name of an object file using "-d", the clusterer built from the
 * training data will be saved to the given file. "-c" selects a nominal
 * class attribute (1-based index, "first" or "last"); it is removed before
 * clustering and only a class based evaluation of the training data is
 * returned.
 * <p>
 * All files opened by this method are closed before it returns, whether it
 * returns normally or throws.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @exception Exception if model could not be evaluated successfully
 * @return a string describing the results
 */
public static String evaluateClusterer (Clusterer clusterer,
                                        String[] options)
    throws Exception {
  int seed = 1;
  int folds = 10;
  boolean doXval = false;
  Instances train = null;
  String trainFileName, testFileName, seedString, foldsString;
  String objectInputFileName, objectOutputFileName, attributeRangeString;
  String[] savedOptions = null;
  boolean printClusterAssignments = false;
  Range attributesToOutput = null;
  ObjectInputStream objectInputStream = null;
  ObjectOutputStream objectOutputStream = null;
  StringBuffer text = new StringBuffer();
  int theClass = -1; // 1-based class attribute index; -1 means no class based evaluation

  // ---- Parse the general options (the same for all clusterers) ----
  try {
    if (Utils.getFlag('h', options)) {
      throw new Exception("Help requested.");
    }
    objectInputFileName = Utils.getOption('l', options);
    objectOutputFileName = Utils.getOption('d', options);
    trainFileName = Utils.getOption('t', options);
    testFileName = Utils.getOption('T', options);
    // Check -p option
    try {
      attributeRangeString = Utils.getOption('p', options);
    } catch (Exception e) {
      throw new Exception(e.getMessage()+"\nNOTE: the -p option expects a parameter specifying a range of attributes to list with the predictions. Use '-p 0' for none.", e);
    }
    if (attributeRangeString.length() != 0) {
      printClusterAssignments = true;
      if (!attributeRangeString.equals("0")) {
        attributesToOutput = new Range(attributeRangeString);
      }
    }
    if (trainFileName.length() == 0) {
      if (objectInputFileName.length() == 0) {
        throw new Exception("No training file and no object "
                            + "input file given.");
      }
      if (testFileName.length() == 0) {
        throw new Exception("No training file and no test file given.");
      }
    } else {
      if ((objectInputFileName.length() != 0)
          && (printClusterAssignments == false)) {
        throw new Exception("Can't use both train and model file "
                            + "unless -p specified.");
      }
    }
    seedString = Utils.getOption('s', options);
    if (seedString.length() != 0) {
      seed = Integer.parseInt(seedString);
    }
    foldsString = Utils.getOption('x', options);
    if (foldsString.length() != 0) {
      folds = Integer.parseInt(foldsString);
      doXval = true;
    }
  } catch (Exception e) {
    // Same message as before (error plus usage text); the cause is kept for debugging.
    throw new Exception('\n'+e.getMessage()+makeOptionString(clusterer), e);
  }

  // ---- Load the training data and open the model files ----
  try {
    if (trainFileName.length() != 0) {
      System.out.println("Reading train file: "+trainFileName);
      BufferedReader trainReader = new BufferedReader(new FileReader(trainFileName));
      try {
        train = new Instances(trainReader);
      } finally {
        // The data has been read (or reading failed): release the file handle.
        try {
          trainReader.close();
        } catch (IOException ignored) {
          // best effort; a failure to close must not hide the real outcome
        }
      }
      System.out.println("Relation: "+train.relationName());
      System.out.println("\tnumInstances: "+train.numInstances());
      System.out.println("\tnumAttributes: "+train.numAttributes());
      String classString = Utils.getOption('c',options);
      if (classString.length() != 0) {
        if (classString.compareTo("last") == 0) {
          theClass = train.numAttributes();
        } else if (classString.compareTo("first") == 0) {
          theClass = 1;
        } else {
          theClass = Integer.parseInt(classString);
        }
        if (doXval || testFileName.length() != 0) {
          throw new Exception("Can only do class based evaluation on the "
                              +"training data");
        }
        if (objectInputFileName.length() != 0) {
          throw new Exception("Can't load a clusterer and do class based "
                              +"evaluation");
        }
      }
      if (theClass != -1) {
        // Report a bad index clearly instead of failing inside attribute().
        if (theClass < 1 || theClass > train.numAttributes()) {
          throw new Exception("Class index " + theClass
                              + " is out of range 1.." + train.numAttributes());
        }
        if (!train.attribute(theClass-1).isNominal()) {
          throw new Exception("Class must be nominal!");
        }
        System.out.println("Setting classIndex: "+(theClass-1));
        train.setClassIndex(theClass-1);
      }
    }
    if (objectInputFileName.length() != 0) {
      objectInputStream = new ObjectInputStream(new FileInputStream(objectInputFileName));
    }
    if (objectOutputFileName.length() != 0) {
      objectOutputStream = new
        ObjectOutputStream(new FileOutputStream(objectOutputFileName));
    }
  } catch (Exception e) {
    // The input stream may already be open when opening the output fails.
    if (objectInputStream != null) {
      try {
        objectInputStream.close();
      } catch (IOException ignored) {
        // best effort
      }
    }
    throw new Exception("ClusterEvaluation: " + e.getMessage() + '.', e);
  }

  // ---- Build or load the clusterer and produce the report ----
  try {
    // Save options
    if (options != null) {
      savedOptions = new String[options.length];
      System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0) {
      Utils.checkForRemainingOptions(options);
      // NOTE(review): this deserialises whatever file was named with -l;
      // only load model files from a trusted source.
      clusterer = (Clusterer)objectInputStream.readObject();
      objectInputStream.close();
      objectInputStream = null;
    } else {
      System.out.println("About to set options for clusterer ...");
      if (clusterer instanceof OptionHandler) {
        ((OptionHandler)clusterer).setOptions(options);
      }
      Utils.checkForRemainingOptions(options);
      // Build the clusterer if no object file provided
      if (train.classIndex() == -1) {
        clusterer.buildClusterer(train);
      } else {
        // Class based evaluation: cluster without the class attribute, then
        // compare the clusters with the class labels of the full data.
        System.out.println("About to remove classAttribute at position "+theClass);
        Remove filter = new Remove();
        filter.setAttributeIndices(""+theClass);
        filter.setInvertSelection(false);
        filter.setInputFormat(train);
        Instances clusterTrain = Filter.useFilter(train,filter);
        System.out.println("\tnumAttributes: "+clusterTrain.numAttributes());
        System.out.println("\tclassIndex: "+clusterTrain.classIndex());
        if (clusterer.K == 0) {
          clusterer.K = train.classAttribute().numValues();
        }
        clusterer.buildClusterer(clusterTrain);
        ClusterEvaluation ce = new ClusterEvaluation(clusterer,train);
        ce.evaluateClusterer();
        return "\n\n=== Clustering stats for training data ===\n\n"+ce.clusterResultsToString();
      }
    }

    /* Output cluster predictions only (for the test data if specified,
       otherwise for the training data */
    if (printClusterAssignments) {
      return printClusterings(clusterer, train, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append("\n\n=== Clustering stats for training data ===\n\n"
                + printClusterStats(clusterer, trainFileName));
    if (testFileName.length() != 0) {
      text.append("\n\n=== Clustering stats for testing data ===\n\n"
                  + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DistributionClusterer) &&
        (doXval == true) &&
        (testFileName.length() == 0) &&
        (objectInputFileName.length() == 0)) {
      // cross validate the log likelihood on the training data
      Random random = new Random(seed);
      train.randomize(random);
      text.append(crossValidateModel(clusterer.getClass().getName()
                                     , train, folds, savedOptions));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
      objectOutputStream.writeObject(clusterer);
      objectOutputStream.flush();
      objectOutputStream.close();
      objectOutputStream = null;
    }
    return text.toString();
  } finally {
    // Early returns and exceptions would otherwise leave the model streams open.
    if (objectInputStream != null) {
      try {
        objectInputStream.close();
      } catch (IOException ignored) {
        // best effort
      }
    }
    if (objectOutputStream != null) {
      try {
        objectOutputStream.close();
      } catch (IOException ignored) {
        // best effort
      }
    }
  }
}
/**
 * Performs a cross-validation
 * for a distribution clusterer on a set of instances.
 * For every fold a fresh clusterer is created by class name, configured with
 * a copy of the options, built on the training split and scored by the
 * average log density of the test split (instances with a non-positive
 * density, or for which the density cannot be computed, contribute nothing).
 * The result is the mean of the fold averages.
 *
 * @param clustererString a string naming the class of the clusterer
 * @param data the data on which the cross-validation is to be
 * performed
 * @param numFolds the number of folds for the cross-validation
 * @param options the options to the clusterer
 * @return a string containing the cross validated log likelihood
 * @exception Exception if a clusterer could not be generated
 */
public static String crossValidateModel (String clustererString,
                                         Instances data,
                                         int numFolds,
                                         String[] options)
    throws Exception {
  Clusterer clusterer = null;
  Instances train, test;
  String[] savedOptions = null;
  double foldAv;
  double CvAv = 0.0;
  StringBuffer CvString = new StringBuffer();

  if (options != null) {
    savedOptions = new String[options.length];
  }
  // Work on a copy so the caller's data set is left alone.
  data = new Instances(data);

  for (int i = 0; i < numFolds; i++) {
    // create clusterer
    try {
      clusterer = (Clusterer)Class.forName(clustererString).newInstance();
    } catch (Exception e) {
      // Keep the cause: the class may exist but fail to instantiate.
      throw new Exception("Can't find class with name "
                          + clustererString + '.', e);
    }
    if (!(clusterer instanceof DistributionClusterer)) {
      throw new Exception(clustererString
                          + " must be a distrinbution "
                          + "clusterer.");
    }

    // Option parsing consumes the array, so every fold gets a fresh copy.
    if (options != null) {
      System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    // Parse options
    if (clusterer instanceof OptionHandler) {
      try {
        ((OptionHandler)clusterer).setOptions(savedOptions);
        Utils.checkForRemainingOptions(savedOptions);
      } catch (Exception e) {
        throw new Exception("Can't parse given options in "
                            + "cross-validation!", e);
      }
    }

    // Build and test clusterer
    train = data.trainCV(numFolds, i);
    clusterer.buildClusterer(train);
    test = data.testCV(numFolds, i);
    foldAv = 0.0;
    for (int j = 0; j < test.numInstances(); j++) {
      try {
        double temp = ((DistributionClusterer)clusterer).
          densityForInstance(test.instance(j));
        if (temp > 0) {
          foldAv += Math.log(temp);
        }
      } catch (Exception ignored) {
        // Deliberately ignored: an instance the clusterer cannot score is
        // treated as unclustered and adds nothing to the fold total.
      }
    }
    CvAv += (foldAv/test.numInstances());
  }

  CvAv /= numFolds;
  CvString.append("\n" + numFolds
                  + " fold CV Log Likelihood: "
                  + Utils.doubleToString(CvAv, 6, 4)
                  + "\n");
  return CvString.toString();
}
// ===============
// Private methods
// ===============
/**
 * Print the cluster statistics for either the training
 * or the testing data. The file is read one instance at a time; every
 * instance is assigned to a cluster and, for a DistributionClusterer, its
 * log density is accumulated. Instances for which this fails are counted as
 * unclustered. The file is closed before the method returns.
 *
 * @param clusterer the clusterer to use for generating statistics.
 * @param fileName the name of the file holding the instances; an empty
 * name produces an empty result
 * @return a string containing cluster statistics.
 * @exception Exception if statistics can't be generated.
 */
private static String printClusterStats (Clusterer clusterer,
                                         String fileName)
    throws Exception {
  StringBuffer text = new StringBuffer();
  int instancesRead = 0;
  double loglk = 0.0;
  int numClusters = clusterer.numberOfClusters();
  double[] instanceStats = new double[numClusters];
  int unclusteredInstances = 0;

  if (fileName.length() != 0) {
    BufferedReader inStream = null;
    try {
      inStream = new BufferedReader(new FileReader(fileName));
    } catch (Exception e) {
      throw new Exception("Can't open file " + e.getMessage() + '.', e);
    }

    try {
      // Header only; instances are then read, processed and dropped one by one.
      Instances inst = new Instances(inStream, 1);
      while (inst.readInstance(inStream)) {
        try {
          int cnum = clusterer.clusterInstance(inst.instance(0));
          if (clusterer instanceof DistributionClusterer) {
            double temp = ((DistributionClusterer)clusterer).
              densityForInstance(inst.instance(0));
            if (temp > 0) {
              loglk += Math.log(temp);
            }
          }
          instanceStats[cnum]++;
        } catch (Exception e) {
          // Deliberate: an instance that cannot be clustered is only counted.
          unclusteredInstances++;
        }
        inst.delete(0);
        instancesRead++;
      }
    } finally {
      // Previously the reader was never closed, leaking a file handle per call.
      try {
        inStream.close();
      } catch (IOException ignored) {
        // best effort; the statistics are already complete or an error is propagating
      }
    }

    int clustFieldWidth = (int)((Math.log(numClusters)/Math.log(10))+1);
    int numInstFieldWidth = (int)((Math.log(instancesRead)/Math.log(10))+1);
    double sum = Utils.sum(instanceStats);
    // Average over the clustered instances; report 0 when there are none
    // instead of dividing by zero.
    loglk = (sum > 0) ? loglk / sum : 0.0;

    text.append("Clustered Instances\n");
    for (int c = 0; c < numClusters; c++) {
      if (instanceStats[c] > 0) {
        text.append(Utils.doubleToString((double)c,
                                         clustFieldWidth, 0)
                    + " "
                    + Utils.doubleToString(instanceStats[c],
                                           numInstFieldWidth, 0)
                    + " ("
                    + Utils.doubleToString((instanceStats[c]/sum*100.0)
                                           , 3, 0) + "%)\n");
      }
    }
    if (unclusteredInstances > 0) {
      text.append("\nUnclustered Instances : "+unclusteredInstances);
    }
    if (clusterer instanceof DistributionClusterer) {
      text.append("\n\nLog likelihood: "
                  + Utils.doubleToString(loglk, 1, 5)
                  + "\n");
    }
  }
  return text.toString();
}
/**
 * Print the cluster assignments for either the training
 * or the testing data. When a test file name is given the file is read one
 * instance at a time (and closed before returning); otherwise the training
 * instances are used. Each output line holds the instance index, the cluster
 * number (or "Unclustered" if the instance could not be clustered) and the
 * requested attribute values.
 *
 * @param clusterer the clusterer to use for cluster assignments
 * @param train the training instances, used when no test file is given
 * @param testFileName the name of the test file, or an empty string
 * @param attributesToOutput the attributes to list with each assignment,
 * or null for none
 * @return a string containing the instance indexes and cluster assigns.
 * @exception Exception if cluster assignments can't be printed
 */
private static String printClusterings (Clusterer clusterer, Instances train,
                                        String testFileName, Range attributesToOutput)
    throws Exception {
  StringBuffer text = new StringBuffer();
  int i = 0;
  int cnum;

  if (testFileName.length() != 0) {
    BufferedReader testStream = null;
    try {
      testStream = new BufferedReader(new FileReader(testFileName));
    } catch (Exception e) {
      throw new Exception("Can't open file " + e.getMessage() + '.', e);
    }

    try {
      // Header only; instances are then read, printed and dropped one by one.
      Instances test = new Instances(testStream, 1);
      while (test.readInstance(testStream)) {
        try {
          cnum = clusterer.clusterInstance(test.instance(0));
          text.append(i + " " + cnum + " "
                      + attributeValuesString(test.instance(0), attributesToOutput) + "\n");
        } catch (Exception e) {
          // Deliberate: report the instance as unclustered instead of aborting.
          text.append(i + " Unclustered "
                      + attributeValuesString(test.instance(0), attributesToOutput) + "\n");
        }
        test.delete(0);
        i++;
      }
    } finally {
      // Previously the reader was never closed, leaking a file handle per call.
      try {
        testStream.close();
      } catch (IOException ignored) {
        // best effort; the output is already complete or an error is propagating
      }
    }
  } else {
    // output for training data
    for (i = 0; i < train.numInstances(); i++) {
      try {
        cnum = clusterer.clusterInstance(train.instance(i));
        text.append(i + " " + cnum + " "
                    + attributeValuesString(train.instance(i), attributesToOutput)
                    + "\n");
      } catch (Exception e) {
        // Deliberate: report the instance as unclustered instead of aborting.
        text.append(i + " Unclustered "
                    + attributeValuesString(train.instance(i), attributesToOutput)
                    + "\n");
      }
    }
  }
  return text.toString();
}
/**
 * Builds a string listing the values of the attributes that fall inside the
 * given range, separated by commas and enclosed in parentheses. The result
 * is empty when no range is given or when no attribute is in the range.
 * The range's upper limit is set to the instance's last attribute index as a
 * side effect.
 *
 * @param instance the instance to print the values from
 * @param attRange the range of the attributes to list, may be null
 * @return a string listing values of the attributes in the range
 */
private static String attributeValuesString(Instance instance, Range attRange) {
  if (attRange == null) {
    return "";
  }
  StringBuffer listed = new StringBuffer();
  final int attributeCount = instance.numAttributes();
  attRange.setUpper(attributeCount - 1);
  boolean anyListed = false;
  for (int index = 0; index < attributeCount; index++) {
    if (!attRange.isInRange(index)) {
      continue;
    }
    // Opening parenthesis before the first value, a comma before the others.
    listed.append(anyListed ? "," : "(");
    listed.append(instance.toString(index));
    anyListed = true;
  }
  if (anyListed) {
    listed.append(")");
  }
  return listed.toString();
}
/**
* Make up the help string giving all the command line options
*
* @param clusterer the clusterer to include options for
* @return a string detailing the valid command line options
*/
private static String makeOptionString (Clusterer clusterer) {
StringBuffer optionsText = new StringBuffer("");
// General options
optionsText.append("\n\nGeneral options:\n\n");
optionsText.append("-t <name of training file>\n");
optionsText.append("\tSets training file.\n");
optionsText.append("-T <name of test file>\n");
optionsText.append("-l <name of input file>\n");
optionsText.append("\tSets model input file.\n");
optionsText.append("-d <name of output file>\n");
optionsText.append("\tSets model output file.\n");
optionsText.append("-p <attribute range>\n");
optionsText.append("\tOutput predictions. Predictions are for "
+ "training file"
+ "\n\tif only training file is specified,"
+ "\n\totherwise predictions are for the test file."
+ "\n\tThe range specifies attribute values to be output"
+ "\n\twith the predictions. Use '-p 0' for none.\n");
optionsText.append("-x <number of folds>\n");
optionsText.append("\tOnly Distribution Clusterers can be cross "
+ "validated.\n");
optionsText.append("-s <random number seed>\n");
optionsText.append("-c <class index>\n");
optionsText.append("\tSet class attribute. If supplied, class is ignored");
optionsText.append("\n\tduring clustering but is used in a classes to");
optionsText.append("\n\tclusters evaluation.\n");
// Get scheme-specific options
if (clusterer instanceof OptionHandler) {
optionsText.append("\nOptions specific to "
+ clusterer.getClass().getName() + ":\n\n");
Enumeration enum = ((OptionHandler)clusterer).listOptions();
while (enum.hasMoreElements()) {
Option option = (Option)enum.nextElement();
optionsText.append(option.synopsis() + '\n');
optionsText.append(option.description() + "\n");
}
}
return optionsText.toString();
}
/**
 * Command line entry point. The first argument names the clusterer; the
 * remaining arguments are passed on as evaluation options. The evaluation
 * report, or the message of any exception, is printed to standard output.
 *
 * @param args the clusterer name followed by the options
 */
public static void main (String[] args) {
  try {
    if (args.length == 0) {
      throw new Exception("The first argument must be the name of a clusterer");
    }
    final String clustererName = args[0];
    // Blank the name so that option parsing does not see it as a leftover.
    args[0] = "";
    Clusterer evaluated = Clusterer.forName(clustererName, null);
    String report = evaluateClusterer(evaluated, args);
    System.out.println(report);
  } catch (Exception failure) {
    System.out.println(failure.getMessage());
  }
}
}