/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * DeduperSplitEvaluator.java * Copyright (C) 2003 Mikhail Bilenko * */ package weka.experiment; import java.io.*; import java.util.*; import weka.core.*; import weka.deduping.*; /** * A SplitEvaluator that produces results for a deduper scheme * on a nominal class attribute. * * -W classname <br> * Specify the full class name of the deduper to evaluate. <p> * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.2 $ */ public class DeduperSplitEvaluator implements SplitEvaluator, OptionHandler { /** The deduper used for evaluation */ protected Deduper m_deduper = new BasicDeduper(); /** Holds the statistics for the most recent application of the deduper */ protected String m_result = null; /** The deduper options (if any) */ protected String m_deduperOptions = ""; /** The deduper version */ protected String m_deduperVersion = ""; /** The length of a key */ private static final int KEY_SIZE = 3; /** The length of a result */ private static final int RESULT_SIZE = 16; /** * No args constructor. */ public DeduperSplitEvaluator() { updateOptions(); } /** * Returns a string describing this split evaluator * @return a description of the split evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return " A SplitEvaluator that produces results for a deduper " +"scheme on a nominal class attribute."; } /** Does nothing, since deduping evaluation does not allow additional measures */ public void setAdditionalMeasures(String [] additionalMeasures){} /** * Returns an enumeration describing the available options.. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(2); newVector.addElement(new Option( "\tThe full class name of the deduper.\n" +"\teg: weka.dedupers.BasicDeduper", "W", 1, "-W <class name>")); if ((m_deduper != null) && (m_deduper instanceof OptionHandler)) { newVector.addElement(new Option( "", "", 0, "\nOptions specific to dedupers " + m_deduper.getClass().getName() + ":")); Enumeration enum = ((OptionHandler)m_deduper).listOptions(); while (enum.hasMoreElements()) { newVector.addElement(enum.nextElement()); } } return newVector.elements(); } /** * Parses a given list of options. Valid options are:<p> * * -W classname <br> * Specify the full class name of the deduper to evaluate. <p> * * All option after -- will be passed to the deduper. * * @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String cName = Utils.getOption('W', options); if (cName.length() == 0) { throw new Exception("A deduper must be specified with" + " the -W option."); } // Do it first without options, so if an exception is thrown during // the option setting, listOptions will contain options for the actual // Deduper. setDeduper(Deduper.forName(cName, null)); if (getDeduper() instanceof OptionHandler) { ((OptionHandler) getDeduper()) .setOptions(Utils.partitionOptions(options)); updateOptions(); } } /** * Gets the current settings of the Deduper. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] deduperOptions = new String [0]; if ((m_deduper != null) && (m_deduper instanceof OptionHandler)) { deduperOptions = ((OptionHandler)m_deduper).getOptions(); } String [] options = new String [deduperOptions.length + 5]; int current = 0; if (getDeduper() != null) { options[current++] = "-W"; options[current++] = getDeduper().getClass().getName(); } options[current++] = "--"; System.arraycopy(deduperOptions, 0, options, current, deduperOptions.length); current += deduperOptions.length; while (current < options.length) { options[current++] = ""; } return options; } /** * Gets the data types of each of the key columns produced for a single run. * The number of key fields must be constant * for a given SplitEvaluator. * * @return an array containing objects of the type of each key column. The * objects should be Strings, or Doubles. */ public Object [] getKeyTypes() { Object [] keyTypes = new Object[KEY_SIZE]; keyTypes[0] = ""; keyTypes[1] = ""; keyTypes[2] = ""; return keyTypes; } /** * Gets the names of each of the key columns produced for a single run. * The number of key fields must be constant * for a given SplitEvaluator. * * @return an array containing the name of each key column */ public String [] getKeyNames() { String [] keyNames = new String[KEY_SIZE]; keyNames[0] = "Scheme"; keyNames[1] = "Scheme_options"; keyNames[2] = "Scheme_version_ID"; return keyNames; } /** * Gets the key describing the current SplitEvaluator. For example * This may contain the name of the deduper used for deduper * predictive evaluation. The number of key fields must be constant * for a given SplitEvaluator. * * @return an array of objects containing the key. */ public Object [] getKey(){ Object [] key = new Object[KEY_SIZE]; key[0] = m_deduper.getClass().getName(); key[1] = m_deduperOptions; key[2] = m_deduperVersion; return key; } /** * Gets the data types of each of the result columns produced for a * single run. The number of result fields must be constant * for a given SplitEvaluator. * * @return an array containing objects of the type of each result column. * The objects should be Strings, or Doubles. */ public Object [] getResultTypes() { int overall_length = RESULT_SIZE; Object [] resultTypes = new Object[overall_length]; Double doub = new Double(0); int current = 0; resultTypes[current++] = doub; // Accuracy stats resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; // Dupe density stats resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; resultTypes[current++] = doub; // Timing stats resultTypes[current++] = doub; resultTypes[current++] = doub; if (current != overall_length) { throw new Error("ResultTypes didn't fit RESULT_SIZE"); } return resultTypes; } /** * Gets the names of each of the result columns produced for a single run. * The number of result fields must be constant * for a given SplitEvaluator. * * @return an array containing the name of each result column */ public String [] getResultNames() { int overall_length = RESULT_SIZE; String [] resultNames = new String[overall_length]; int current = 0; resultNames[current++] = "Number_of_instances"; // Accuracy stats resultNames[current++] = "Recall"; resultNames[current++] = "Precision"; resultNames[current++] = "Fmeasure"; // Dupe density stats resultNames[current++] = "TotalPairsTrain"; resultNames[current++] = "PotentialDupePairsTrain"; resultNames[current++] = "ActualDupePairsTrain"; resultNames[current++] = "PotentialNonDupePairsTrain"; resultNames[current++] = "ActualNonDupePairsTrain"; resultNames[current++] = "DupeNonDupeRatioTrain"; resultNames[current++] = "DupeOveralProportionTrain"; resultNames[current++] = "TotalPairsTest"; resultNames[current++] = "DupePairsTest"; resultNames[current++] = "DupeNonDupeRatioTest"; // Timing stats resultNames[current++] = "Time_training"; resultNames[current++] = "Time_testing"; if (current != overall_length) { throw new Error("ResultNames didn't fit RESULT_SIZE"); } return resultNames; } /** * Gets the results for the supplied train and test datasets. * * @param train the training Instances. * @param test the testing Instances. * @return the raw results stored in an array. The objects stored in * the array are object arrays, containing actual P/R/FM values for each point * @exception Exception if a problem occurs while getting the results */ public Object [] getResult(Instances trainData, Instances testData) throws Exception { if (trainData.classAttribute().type() != Attribute.NOMINAL) { throw new Exception("Class attribute is not nominal!"); } if (m_deduper == null) { throw new Exception("No deduper has been specified"); } DedupingEvaluation eval = new DedupingEvaluation(); eval.trainDeduper(m_deduper, trainData, testData); ArrayList rawResultList = eval.evaluateModel(m_deduper, testData); return rawResultList.toArray(); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String deduperTipText() { return "The deduper to use."; } /** * Get the value of Deduper. * * @return Value of Deduper. */ public Deduper getDeduper() { return m_deduper; } /** * Sets the deduper. * * @param newDeduper the new deduper to use. */ public void setDeduper(Deduper newDeduper) { m_deduper = newDeduper; updateOptions(); } /** * Updates the options that the current deduper is using. */ protected void updateOptions() { if (m_deduper instanceof OptionHandler) { m_deduperOptions = Utils.joinOptions(((OptionHandler)m_deduper) .getOptions()); } else { m_deduperOptions = ""; } if (m_deduper instanceof Serializable) { ObjectStreamClass obs = ObjectStreamClass.lookup(m_deduper .getClass()); m_deduperVersion = "" + obs.getSerialVersionUID(); } else { m_deduperVersion = ""; } } /** * Set the Deduper to use, given it's class name. A new deduper will be * instantiated. * * @param newDeduper the Deduper class name. * @exception Exception if the class name is invalid. */ public void setDeduperName(String newDeduperName) throws Exception { try { setDeduper((Deduper)Class.forName(newDeduperName) .newInstance()); } catch (Exception ex) { throw new Exception("Can't find Deduper with class name: " + newDeduperName); } } /** * Gets the raw output from the deduper * @return the raw output from the deduper */ public String getRawResultOutput() { StringBuffer result = new StringBuffer(); if (m_deduper == null) { return "<null> deduper"; } result.append(toString()); result.append("Deduper model: \n"+m_deduper.toString()+'\n'); // append the performance statistics if (m_result != null) { result.append(m_result); } return result.toString(); } /** * Returns a text description of the split evaluator. * * @return a text description of the split evaluator. */ public String toString() { String result = "DeduperSplitEvaluator: "; if (m_deduper == null) { return result + "<null> deduper"; } return result + m_deduper.getClass().getName() + " " + m_deduperOptions + "(version " + m_deduperVersion + ")"; } } // DeduperSplitEvaluator