/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BasicDeduper.java
 * Copyright (C) 2003 Mikhail Bilenko
 */

package weka.deduping;

import weka.core.*;
import weka.deduping.metrics.*;
import weka.deduping.blocking.*;
import java.io.Serializable;
import java.util.*;
import weka.clusterers.Cluster;

/**
 * A basic deduper that takes a set of objects and identifies disjoint
 * subsets of duplicates.  Works by greedy agglomerative merging: every
 * instance starts in a singleton cluster, and the two closest clusters
 * (under a trained {@link InstanceMetric}) are merged repeatedly until
 * the desired number of clusters remains.
 *
 * @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
 * @version $Revision: 1.7 $
 */
public class BasicDeduper extends Deduper implements OptionHandler, Serializable {

  /** A metric measuring similarity between every pair of instances */
  InstanceMetric m_metric = new SumInstanceMetric();

  /** The proportion of the training fold that should be used for training */
  protected double m_trainProportion = 1.0;

  /** Distance matrix containing the distance between each pair of instances */
  protected double[][] m_distanceMatrix = null;

  /** Instance hash, where each Integer index is hashed to an instance */
  protected HashMap m_instancesHash = null;

  /** Reverse instance hash, where each instance is hashed to its Integer index */
  protected HashMap m_reverseInstancesHash = null;

  /** The attribute indeces on which to do deduping */
  protected int[] m_attrIdxs = null;

  /** The total number of true objects */
  protected int m_numObjects = 0;

  /** An array containing class values for instances (for faster statistics) */
  protected double[] m_classValues = null;

  /** Number of clusters currently in the agglomeration process */
  protected int m_numCurrentObjects = 0;

  /** Holds the clusters */
  protected ArrayList m_clusters = null;

  /** A set of instances to dedupe */
  protected Instances m_testInstances = null;

  /** Use blocking to restrict the candidate pairs? */
  protected boolean m_useBlocking = false;

  /** Temporary variable holding cluster assignments */
  protected int[] m_clusterAssignments;

  /** Verbose output? */
  protected boolean m_debug = false;

  /** Statistics */
  protected int m_numTotalPairs = 0;
  protected int m_numGoodPairs = 0;
  protected int m_numTruePairs = 0;
  protected int m_numTotalPairsTrain = 0;  // the overall number of pairs in the train split
  protected int m_numTotalPairsTest = 0;   // the overall number of pairs in the test split
  protected int m_numPotentialDupePairsTrain = 0;
  protected int m_numActualDupePairsTrain = 0;
  protected int m_numPotentialNonDupePairsTrain = 0;
  protected int m_numActualNonDupePairsTrain = 0;
  protected double m_trainTime = 0;
  protected double m_testTimeStart = 0;

  /**
   * Given training data, build the metrics required by the deduper.
   *
   * @param trainFold the fold from which a training subset is selected
   *        (see {@link #getTrainingSet})
   * @param testInstances the test instances, passed through to the metric
   * @exception Exception if the metric cannot be trained
   */
  public void buildDeduper(Instances trainFold, Instances testInstances) throws Exception {
    Instances trainInstances = getTrainingSet(trainFold);
    m_numTotalPairsTrain = trainInstances.numInstances() * (trainInstances.numInstances() - 1) / 2;
    m_numPotentialDupePairsTrain = numTruePairs(trainInstances);
    m_numPotentialNonDupePairsTrain = m_numTotalPairsTrain - m_numPotentialDupePairsTrain;

    // if the indexes have not been set, use all attributes except for class
    if (m_attrIdxs == null) {
      m_attrIdxs = new int[trainInstances.numAttributes() - 1];
      int classIdx = trainInstances.classIndex();
      int counter = 0;
      for (int i = 0; i < m_attrIdxs.length + 1; i++) {
        if (i != classIdx) {
          m_attrIdxs[counter++] = i;
        }
      }
    }

    // train the instance metric, recording the wall-clock training time
    long trainTimeStart = System.currentTimeMillis();
    m_metric.buildInstanceMetric(m_attrIdxs);
    m_metric.trainInstanceMetric(trainInstances, testInstances);

    // get training statistics
    m_numActualDupePairsTrain = m_metric.getNumActualPosPairs();
    m_numActualNonDupePairsTrain = m_metric.getNumActualNegPairs();

    m_trainTime = (System.currentTimeMillis() - trainTimeStart) / 1000.0;
    // invalidate any distance matrix computed with a previously trained metric
    m_distanceMatrix = null;
  }

  /**
   * Identify duplicates within the testing data by agglomerative merging.
   * Results are accumulated in m_clusters and in the statistics fields.
   *
   * @param testInstances a set of instances among which to identify duplicates
   * @param numObjects the number of "true object" sets to create
   * @exception Exception if deduping fails
   */
  public void findDuplicates(Instances testInstances, int numObjects) throws Exception {
    m_numObjects = testInstances.numClasses();
    m_numTruePairs = numTruePairs(testInstances);
    m_numTotalPairsTest = testInstances.numInstances() * (testInstances.numInstances() - 1) / 2;
    resetStatistics();

    hashInstances(testInstances);
    createDistanceMatrix();

    // assign instances to singleton clusters
    m_numCurrentObjects = testInstances.numInstances();
    m_clusterAssignments = new int[testInstances.numInstances()];
    for (int i = 0; i < testInstances.numInstances(); i++) {
      m_clusterAssignments[i] = i;
    }

    // initialize m_clusters arraylist
    initIntClusters();
    if (m_debug) {
      System.out.println("Starting with " + (m_numCurrentObjects) + " clusters; "
                         + m_clusters.size() + "actual clusters; "
                         + numObjects + " true objects desired");
    }

    // merge clusters until desired number of clusters is reached
    while (m_numCurrentObjects > numObjects) {
      if (m_debug) {
        System.out.println("Merging with " + (m_numCurrentObjects) + " clusters left");
      }
      mergeStep();
    }
    System.out.println("Done deduping with " + m_clusters.size() + " clusters");
  }

  /**
   * Creates the initial clustering: one singleton cluster per test instance,
   * each holding the instance's Integer index.
   *
   * @return the list of singleton clusters (also stored in m_clusters)
   * @exception Exception if clusters could not be computed successfully
   */
  public ArrayList initIntClusters() throws Exception {
    m_clusters = new ArrayList();
    for (int i = 0; i < m_testInstances.numInstances(); i++) {
      m_clusters.add(new Cluster(new Integer(i)));
    }
    return m_clusters;
  }

  /**
   * Internal method that finds the two most similar clusters and merges them.
   * Ties on the best distance are broken uniformly at random.
   */
  protected void mergeStep() throws Exception {
    double bestDistance = Double.MAX_VALUE;
    double thisDistance;
    Cluster thisCluster;
    // flat list of candidate index pairs: entries (2k, 2k+1) form one pair
    ArrayList mergeCandidatesList = new ArrayList();

    if (m_debug) {
      System.out.println("\nBefore merge step there are " + m_clusters.size()
                         + " clusters; m_numCurrentObjects=" + m_numCurrentObjects);
    }

    // find the two most similar clusters
    for (int i = 0; i < m_clusters.size() - 1; i++) {
      thisCluster = (Cluster) m_clusters.get(i);
      for (int j = i + 1; j < m_clusters.size(); j++) {
        thisDistance = clusterDistance(thisCluster, (Cluster) m_clusters.get(j));
        if (thisDistance == bestDistance) {
          // tie: add to the list of top candidates
          mergeCandidatesList.add(new Integer(i));
          mergeCandidatesList.add(new Integer(j));
        } else if (thisDistance < bestDistance) {
          // best distance seen so far; restart the candidate list
          mergeCandidatesList.clear();
          mergeCandidatesList.add(new Integer(i));
          mergeCandidatesList.add(new Integer(j));
          bestDistance = thisDistance;
        }
      }
    }

    // randomly pick a most-similar pair from the candidate list;
    // i2 is the partner entry of i1 within the same (even, odd) slot pair
    int i1 = (int) (mergeCandidatesList.size() * Math.random());
    int i2 = (i1 % 2 > 0) ? (i1 - 1) : (i1 + 1);
    int cluster1Idx = ((Integer) mergeCandidatesList.get(i1)).intValue();
    int cluster2Idx = ((Integer) mergeCandidatesList.get(i2)).intValue();

    if (m_debug) {
      System.out.println("\nMerging clusters " + cluster1Idx + "("
                         + ((Cluster) m_clusters.get(cluster1Idx)).get(0) + ") and "
                         + cluster2Idx + "("
                         + ((Cluster) m_clusters.get(cluster2Idx)).get(0)
                         + "); distance=" + bestDistance + " actual="
                         + clusterDistance((Cluster) m_clusters.get(cluster1Idx),
                                           (Cluster) m_clusters.get(cluster2Idx)));
      Instance in1 = (Instance) m_instancesHash.get(((Cluster) m_clusters.get(cluster1Idx)).get(0));
      Instance in2 = (Instance) m_instancesHash.get(((Cluster) m_clusters.get(cluster2Idx)).get(0));
      if (in1.classValue() == in2.classValue()) {
        System.out.println("good: " + bestDistance + "\t" + in1 + "\tand" + in2);
      } else {
        System.out.println("BAD: " + bestDistance + "\t" + in1 + "\tand" + in2);
      }
    }

    Cluster newCluster = mergeClusters(cluster1Idx, cluster2Idx);

    // remove the higher index first so the lower index is still valid
    if (cluster1Idx > cluster2Idx) {
      m_clusters.remove(cluster1Idx);
      m_clusters.remove(cluster2Idx);
    } else {
      m_clusters.remove(cluster2Idx);
      m_clusters.remove(cluster1Idx);
    }
    m_clusters.add(newCluster);
    m_numCurrentObjects--;
  }

  /**
   * Internal method that returns the distance between two clusters.
   * Each cluster is represented by its first element's index into the
   * distance matrix.
   */
  protected double clusterDistance(Cluster cluster1, Cluster cluster2) {
    if (cluster2 == null || cluster1 == null) {
      // diagnostic dump before the inevitable NullPointerException below
      try {
        printIntClusters();
      } catch (Exception e) {
      }
    }
    int i1 = ((Integer) cluster1.get(0)).intValue();
    int i2 = ((Integer) cluster2.get(0)).intValue();
    return m_distanceMatrix[i1][i2];
  }

  /** Unused debug flag; dead code retained for compatibility. */
  boolean fuckedUp = false;

  /**
   * Internal method to merge two clusters and update distances.
   * The merged cluster keeps cluster1's first element as its
   * representative; the distance matrix rows/columns for that
   * representative are updated to the minimum of the two clusters'
   * representative distances (single-link style), and infinity links
   * are propagated in both directions.
   *
   * @param cluster1Idx index of the first cluster in m_clusters
   * @param cluster2Idx index of the second cluster in m_clusters
   * @return the merged cluster (caller is responsible for updating m_clusters)
   */
  protected Cluster mergeClusters(int cluster1Idx, int cluster2Idx) throws Exception {
    Cluster newCluster = new Cluster();
    Cluster cluster1 = (Cluster) m_clusters.get(cluster1Idx);
    Cluster cluster2 = (Cluster) m_clusters.get(cluster2Idx);
    int cluster1FirstIdx = ((Integer) cluster1.get(0)).intValue();
    int cluster2FirstIdx = ((Integer) cluster2.get(0)).intValue();
    newCluster.copyElements(cluster1);
    newCluster.copyElements(cluster2);

    // Update the distance matrix depending on the linkage type:
    // go through all clusters and update the distance from their first
    // element to the first element of the new cluster
    for (int i = 0; i < m_clusters.size(); i++) {
      if (i != cluster1Idx && i != cluster2Idx) {  // skip the merged clusters themselves
        Cluster currentCluster = (Cluster) m_clusters.get(i);
        int currClusterFirstIdx = ((Integer) currentCluster.get(0)).intValue();
        if (m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx]
            <= m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx]) {
          // first cluster is closer, no need to update
        } else {
          // second cluster is closer; update the distance between the representatives
          m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] =
            m_distanceMatrix[currClusterFirstIdx][cluster1FirstIdx] =
              m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx];
        }

        // propagate infinity ("cannot link") distances in both directions
        if (m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] == Double.POSITIVE_INFINITY) {
          m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] =
            m_distanceMatrix[currClusterFirstIdx][cluster1FirstIdx] = Double.POSITIVE_INFINITY;
        }
        if (m_distanceMatrix[cluster1FirstIdx][currClusterFirstIdx] == Double.POSITIVE_INFINITY) {
          m_distanceMatrix[cluster2FirstIdx][currClusterFirstIdx] =
            m_distanceMatrix[currClusterFirstIdx][cluster2FirstIdx] = Double.POSITIVE_INFINITY;
        }
      }
    }

    // update pair statistics: every cross-cluster pair is newly joined
    m_numTotalPairs += cluster1.size() * cluster2.size();
    int newGoodPairs = numCrossClusterTruePairs(cluster1, cluster2);
    m_numGoodPairs += newGoodPairs;
    accumulateStatistics();
    return newCluster;
  }

  /**
   * Creates the hashtables from the given Instances; keys are numeric
   * indeces, values are actual Instances (and vice versa for the reverse
   * hash).  Also caches class values for fast statistics.
   *
   * @param data Instances to hash
   */
  protected void hashInstances(Instances data) {
    m_testInstances = data;
    m_instancesHash = new HashMap();
    m_reverseInstancesHash = new HashMap();
    m_classValues = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
      Instance instance = data.instance(i);
      m_classValues[i] = instance.classValue();
      // NOTE: containsValue is O(n); acceptable for the data sizes used here
      if (!m_instancesHash.containsValue(instance)) {
        Integer idx = new Integer(i);
        m_instancesHash.put(idx, instance);
        m_reverseInstancesHash.put(instance, idx);
      } else {
        System.out.println("Warning: duplicate instance encountered at index " + i);
      }
    }
  }

  /**
   * Fill the distance matrix with values using the metric.  When blocking
   * is enabled, only the most similar candidate pairs produced by the
   * blocker get real metric distances; all other entries stay at
   * Double.MAX_VALUE.
   */
  protected void createDistanceMatrix() throws Exception {
    int n = m_instancesHash.size();
    m_distanceMatrix = new double[n][n];

    if (m_useBlocking) {
      for (int i = 0; i < n; i++) {
        Arrays.fill(m_distanceMatrix[i], Double.MAX_VALUE);
      }
      Blocking blocker = new Blocking();
      blocker.buildIndex(m_testInstances);
      InstancePair[] pairs = blocker.getMostSimilarPairs(m_testInstances.numClasses() * 50);
      for (int i = 0; i < pairs.length && pairs[i] != null; i++) {
        int idx1 = ((Integer) m_reverseInstancesHash.get(pairs[i].instance1)).intValue();
        int idx2 = ((Integer) m_reverseInstancesHash.get(pairs[i].instance2)).intValue();
        // BUGFIX: the original assigned [idx1][idx2] twice, leaving the
        // symmetric [idx2][idx1] entry at MAX_VALUE, so candidate pairs whose
        // (i,j) ordering differed from (idx1,idx2) were skipped below
        m_distanceMatrix[idx1][idx2] = m_distanceMatrix[idx2][idx1] = pairs[i].value;
      }
    }

    for (int i = 0; i < n; i++) {
      for (int j = i + 1; j < n; j++) {
        if (!m_useBlocking || m_distanceMatrix[i][j] != Double.MAX_VALUE) {
          m_distanceMatrix[i][j] = m_distanceMatrix[j][i] =
            m_metric.distance((Instance) m_instancesHash.get(new Integer(i)),
                              (Instance) m_instancesHash.get(new Integer(j)));
        }
      }
    }
  }

  /**
   * Outputs the current clustering to standard output.
   *
   * @exception Exception if the clusters have not been created yet
   */
  public void printIntClusters() throws Exception {
    if (m_clusters == null) {
      throw new Exception("Clusters were not created");
    }
    for (int i = 0; i < m_clusters.size(); i++) {
      Cluster cluster = (Cluster) m_clusters.get(i);
      System.out.println("Cluster " + i + " consists of " + cluster.size() + " elements");
      for (int j = 0; j < cluster.size(); j++) {
        Integer idx = (Integer) cluster.get(j);
        Instance instance = (Instance) m_instancesHash.get(idx);
        System.out.println("\t\t" + instance);
      }
    }
  }

  /**
   * A helper function that stratifies a training set and selects a
   * proportion of true objects (classes) for training.
   *
   * @param instances a set of instances from which to select the training data
   * @return a subset of those instances containing all members of the
   *         randomly selected classes
   */
  Instances getTrainingSet(Instances instances) {
    HashMap classHash = new HashMap();
    int numTotalInstances = instances.numInstances();
    Instances trainInstances = new Instances(instances, (int) (m_trainProportion * numTotalInstances));

    // hash each class to the list of its instances
    for (int i = 0; i < instances.numInstances(); i++) {
      Instance instance = instances.instance(i);
      Double classValue = new Double(instance.classValue());
      if (classHash.containsKey(classValue)) {
        ArrayList list = (ArrayList) classHash.get(classValue);
        list.add(instance);
      } else {
        // this class has not been seen before, create an entry for it
        ArrayList list = new ArrayList();
        list.add(instance);
        classHash.put(classValue, list);
      }
    }

    // select a desired proportion of classes
    ArrayList[] classes = new ArrayList[classHash.size()];
    classes = (ArrayList[]) classHash.values().toArray(classes);
    int numClasses = classes.length;
    int[] indeces = PairwiseSelector.randomSubset((int) (m_trainProportion * numClasses), numClasses);
    for (int i = 0; i < indeces.length; i++) {
      // BUGFIX: the original iterated classes[i], ignoring the random
      // subset and always taking the first indeces.length classes
      ArrayList selectedClass = classes[indeces[i]];
      for (int j = 0; j < selectedClass.size(); j++) {
        Instance instance = (Instance) selectedClass.get(j);
        trainInstances.add(instance);
      }
    }
    return trainInstances;
  }

  /**
   * Set the amount of training
   * @param trainProportion the proportion of the training set that will be used for learning
   */
  public void setTrainProportion(double trainProportion) {
    m_trainProportion = trainProportion;
  }

  /**
   * Get the amount of training
   * @return the proportion of the training set that will be used for learning
   */
  public double getTrainProportion() {
    return m_trainProportion;
  }

  /**
   * Given a test set, calculate the number of true pairs.
   *
   * @param instances a set of objects, class has the true object ID
   * @return the number of true same-class pairs
   */
  protected int numTruePairs(Instances instances) {
    int numTruePairs = 0;

    // get the class counts
    HashMap classCountMap = new HashMap();
    for (int i = 0; i < instances.numInstances(); i++) {
      Instance instance = instances.instance(i);
      Double classValue = new Double(instance.classValue());
      if (classCountMap.containsKey(classValue)) {
        Integer counts = (Integer) classCountMap.get(classValue);
        classCountMap.put(classValue, new Integer(counts.intValue() + 1));
      } else {
        classCountMap.put(classValue, new Integer(1));
      }
    }

    // sum n*(n-1)/2 over each class of size n
    Iterator iterator = classCountMap.values().iterator();
    while (iterator.hasNext()) {
      int counts = ((Integer) iterator.next()).intValue();
      numTruePairs += counts * (counts - 1) / 2;
    }
    return numTruePairs;
  }

  /**
   * Given two clusters, calculate the number of true pairs that will be
   * added when the clusters are merged.
   *
   * @param cluster1 the first cluster to merge
   * @param cluster2 the second cluster to merge
   * @return the number of true pairs that will appear once clusters are merged
   */
  protected int numCrossClusterTruePairs(Cluster cluster1, Cluster cluster2) {
    int numCCTruePairs = 0;
    int[] classCounts1 = new int[m_numObjects];
    for (int i = 0; i < cluster1.size(); i++) {
      Integer instanceIdx = (Integer) cluster1.get(i);
      classCounts1[(int) m_classValues[instanceIdx.intValue()]]++;
    }
    int[] classCounts2 = new int[m_numObjects];
    for (int i = 0; i < cluster2.size(); i++) {
      Integer instanceIdx = (Integer) cluster2.get(i);
      classCounts2[(int) m_classValues[instanceIdx.intValue()]]++;
    }
    // every cross-cluster pair within the same class is a new true pair
    for (int i = 0; i < m_numObjects; i++) {
      numCCTruePairs += classCounts1[i] * classCounts2[i];
    }
    return numCCTruePairs;
  }

  /** Add the current state of things to statistics */
  protected void accumulateStatistics() {
    // 16 slots: 1 cluster count + 3 accuracy + 10 dupe density + 2 timing
    Object[] currentStats = new Object[16];
    double precision = (m_numGoodPairs + 0.0) / m_numTotalPairs;
    double recall = (m_numGoodPairs + 0.0) / m_numTruePairs;
    double fmeasure = 0;
    if (precision > 0) {  // avoid divide by zero in the p=0&r=0 case
      fmeasure = 2 * (precision * recall) / (precision + recall);
    }
    int statIdx = 0;
    currentStats[statIdx++] = new Double(m_numCurrentObjects);

    // Accuracy statistics
    currentStats[statIdx++] = new Double(recall);
    currentStats[statIdx++] = new Double(precision);
    currentStats[statIdx++] = new Double(fmeasure);

    // Dupe density statistics
    currentStats[statIdx++] = new Double(m_numTotalPairsTrain);
    currentStats[statIdx++] = new Double(m_numPotentialDupePairsTrain);
    currentStats[statIdx++] = new Double(m_numActualDupePairsTrain);
    currentStats[statIdx++] = new Double(m_numPotentialNonDupePairsTrain);
    currentStats[statIdx++] = new Double(m_numActualNonDupePairsTrain);
    currentStats[statIdx++] = new Double((m_numActualNonDupePairsTrain > 0)
        ? ((m_numActualDupePairsTrain + 0.0) / m_numActualNonDupePairsTrain) : 0);
    currentStats[statIdx++] = new Double((m_numPotentialDupePairsTrain + 0.0) / m_numTotalPairsTrain);
    currentStats[statIdx++] = new Double(m_numTotalPairsTest);
    currentStats[statIdx++] = new Double(m_numTruePairs);
    currentStats[statIdx++] = new Double((m_numTruePairs + 0.0) / m_numTotalPairsTest);

    // Timing statistics
    currentStats[statIdx++] = new Double(m_trainTime);
    currentStats[statIdx++] = new Double((System.currentTimeMillis() - m_testTimeStart) / 1000.0);
    m_statistics.add(currentStats);
  }

  /** Reset the current statistics */
  protected void resetStatistics() {
    m_statistics = new ArrayList();
    m_numGoodPairs = 0;
    m_numTotalPairs = 0;
    m_testTimeStart = System.currentTimeMillis();
  }

  /**
   * Set the InstanceMetric that is used
   * @param metric the InstanceMetric that is used to dedupe
   */
  public void setMetric(InstanceMetric metric) {
    m_metric = metric;
  }

  /**
   * Get the InstanceMetric that is used
   * @return the InstanceMetric that is used to dedupe
   */
  public InstanceMetric getMetric() {
    return m_metric;
  }

  /**
   * Turn debugging output on/off
   * @param debug if true, debugging info will be printed
   */
  public void setDebug(boolean debug) {
    m_debug = debug;
  }

  /**
   * See whether debugging output is on/off
   * @return if true, debugging info will be printed
   */
  public boolean getDebug() {
    return m_debug;
  }

  /**
   * Turn blocking on/off
   * @param useBlocking if true, blocking is on
   */
  public void setUseBlocking(boolean useBlocking) {
    m_useBlocking = useBlocking;
  }

  /**
   * See whether blocking is on/off
   * @return if true, blocking is on
   */
  public boolean getUseBlocking() {
    return m_useBlocking;
  }

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(2);
    newVector.addElement(new Option("\tMetric.\n" + "\t(default=ClassifierInstanceMetric)",
                                    "M", 1, "-M metric_name metric_options"));
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * Valid options are:<p>
   *
   * -M metric options <p>
   * InstanceMetric used <p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String metricString = Utils.getOption('M', options);
    if (metricString.length() != 0) {
      String[] metricSpec = Utils.splitOptions(metricString);
      String metricName = metricSpec[0];
      metricSpec[0] = "";
      System.out.println("Metric name: " + metricName
                         + "\nMetric parameters: " + concatStringArray(metricSpec));
      setMetric(InstanceMetric.forName(metricName, metricSpec));
    }
  }

  /**
   * Gets the current settings of Greedy Agglomerative Clustering
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[250];
    int current = 0;

    if (m_useBlocking == false) {
      options[current++] = "-NB";
    }
    options[current++] = "-T";
    options[current++] = "" + m_trainProportion;
    if (m_debug) {
      options[current++] = "-D";
    }
    options[current++] = "-M";
    options[current++] = Utils.removeSubstring(m_metric.getClass().getName(), "weka.deduping.metrics.");
    if (m_metric instanceof OptionHandler) {
      String[] metricOptions = ((OptionHandler) m_metric).getOptions();
      for (int i = 0; i < metricOptions.length; i++) {
        options[current++] = metricOptions[i];
      }
    }

    // pad the remainder with empty strings
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * A little helper to create a single String from an array of Strings
   * @param strings an array of strings
   * @return a single concatenated string, each element followed by a space
   */
  public static String concatStringArray(String[] strings) {
    StringBuffer buffer = new StringBuffer();
    for (int i = 0; i < strings.length; i++) {
      buffer.append(strings[i]);
      buffer.append(" ");
    }
    return buffer.toString();
  }
}