/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* InstanceMetric.java
* Copyright (C) 2003 Mikhail Bilenko
*
*/
package weka.deduping.metrics;
import java.util.ArrayList;
import java.io.Serializable;
import weka.core.*;
/**
* Abstract InstanceMetric class for writing metrics that
* calculate distance between instances describing database records
*
* @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
* @version $Revision: 1.2 $
*/
public abstract class InstanceMetric {
/** indeces of attributes which the metric works on */
protected int [] m_attrIdxs = null;
protected StringMetric [][] m_metrics = null;
/** index of the class attribute */
protected int m_classIndex = -1;
/** The actual number of training pairs used in the last training round */
protected int m_numActualPosPairs = 0;
protected int m_numActualNegPairs = 0;
// ===============
// Public methods.
// ===============
/**
* Generates a new InstanceMetric based on specified
* attributes. Has to initialize all fields of the metric with
* default values.
*
* @param numAttributes the number of attributes that the metric will work on
* @exception Exception if the distance metric has not been
* generated successfully. */
public abstract void buildInstanceMetric(int[] attrIdxs) throws Exception;
/**
* Create a new metric for operating on specified instances
* @param trainData instances that the metric will be trained on
* @param testData instances that the metric will be used on
*/
public abstract void trainInstanceMetric(Instances trainData, Instances testData) throws Exception;
/**
* Specifies a list of attributes which will be used by the metric
*
* @param attrs an array of attribute indices
*/
public void setAttrIdxs (int[] attrIdxs) {
m_attrIdxs = new int[attrIdxs.length];
System.arraycopy(attrIdxs, 0, m_attrIdxs, 0, attrIdxs.length);
}
/**
* Returns an array of attribute incece which will be used by the metric
*
* @return an array of attribute indices
*/
public int[] getAttrIndxs () {
return m_attrIdxs;
}
/**
* Specifies an interval of attributes which will be used by the metric
*
* @param begin_index beginning of attribute index interval
* @param end_index end of attribute index interval
*/
public void setAttrIdxs (int startIdx, int endIdx) {
m_attrIdxs = new int[endIdx - startIdx + 1];
for (int i = startIdx; i <= endIdx; i++)
m_attrIdxs[i - startIdx] = i;
}
/**
* Returns a distance value between two instances.
* @param instance1 First instance.
* @param instance2 Second instance.
* @exception Exception if distance could not be estimated.
*/
public abstract double distance(Instance instance1, Instance instance2) throws Exception;
/**
* Returns a similarity estimate between two instances.
* @param instance1 First instance.
* @param instance2 Second instance.
* @exception Exception if similarity could not be estimated.
*/
public abstract double similarity(Instance instance1, Instance instance2) throws Exception;
/**
* It is often the case that last attribute of the data is the class.
* This function takes instances, and returns an array of integers
* 0..(num_attributes-1 - 1) to exclude the class attribute
*
* @return array of integer indeces of attributes, excluding
* last one which is the class index
*/
public int[] getAttrIdxsWithoutLastClass(Instances instances) {
int [] attrIdxs;
attrIdxs = new int[instances.numAttributes() - 1];
for (int i = 0; i < attrIdxs.length; i++) {
attrIdxs[i] = i;
}
return attrIdxs;
}
/**
* This function takes instances, and returns an array of integers
* 0..(num_attributes-1)
*
* @return array of integer indeces of attributes
*/
public int[] getAttrIdxs(Instances instances) {
int [] attrIdxs;
attrIdxs = new int[instances.numAttributes()];
for (int i = 0; i < attrIdxs.length; i++) {
attrIdxs[i] = i;
}
return attrIdxs ;
}
/** Specify which attribute is the class attribute
* @param classAttrIdx the index of the class attribute
*/
public void setClassIndex(int classIndex) {
m_classIndex = classIndex;
}
/** Get the index of the attribute is the class attribute
* @returns the index of the class attribute
*/
public int getClassIndex(int classIndex) {
return m_classIndex;
}
/** Get the number of attributes that the metric uses
* @returns the number of attributes that the metric uses
*/
public int getNumAttributes() {
return m_attrIdxs.length;
}
/** The computation of a metric can be either based on distance, or on similarity
* @returns true if the underlying metric computes distance, false if similarity
*/
public abstract boolean isDistanceBased();
/** Return the actual number of positive training instances used in the last
* training round
* @return the true number of duplicate pairs used for training in the last round
*/
public int getNumActualPosPairs() {
return m_numActualPosPairs;
}
/** Return the actual number of negative training instances used in the last
* training round
* @return the true number of non-duplicate pairs used for training in the last round
*/
public int getNumActualNegPairs() {
return m_numActualNegPairs;
}
/**
* Creates a new instance of a metric given it's class name and
* (optional) arguments to pass to it's setOptions method. If the
* classifier implements OptionHandler and the options parameter is
* non-null, the classifier will have it's options set.
*
* @param metricName the fully qualified class name of the metric
* @param options an array of options suitable for passing to setOptions. May
* be null.
* @return the newly created metric ready for use.
* @exception Exception if the metric name is invalid, or the options
* supplied are not acceptable to the metric
*/
public static InstanceMetric forName(String metricName,
String [] options) throws Exception {
return (InstanceMetric)Utils.forName(InstanceMetric.class,
metricName,
options);
}
}