/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    SumInstanceMetric.java
 *    Copyright (C) 2003 Mikhail Bilenko
 *
 */

package weka.deduping.metrics;

import java.util.*;
import java.text.SimpleDateFormat;
import java.io.*;

import weka.deduping.*;
import weka.core.*;

import weka.classifiers.DistributionClassifier;
import weka.classifiers.functions.SMO;

/**
 * SumInstanceMetric simply adds up the values returned by StringMetrics
 * on the individual fields of two instances.
 *
 * @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
 * @version $Revision: 1.5 $
 */
public class SumInstanceMetric extends InstanceMetric implements OptionHandler, Serializable {

  /** Number of random test-data pairs added to each learnable metric's
   * training list. It is 0, i.e. the transductive step is switched off
   * (the original code spelled this bound as the literal <code>00</code>). */
  private static final int NUM_TRANSDUCTIVE_PAIRS = 0;

  /** getOptions() has always returned an array padded with empty strings
   * to this length; the padding is kept for callers that rely on it. */
  private static final int MIN_OPTIONS_LENGTH = 60;

  /** A selector object that will create training sets */
  PairwiseSelector m_selector = new PairwiseSelector();

  /** An array of StringMetrics that are to be used on each attribute */
  /*protected*/public StringMetric [] m_stringMetrics = null;

  /** The baseline metric; buildInstanceMetric() clones it once per attribute */
  protected StringMetric m_metric = new AffineMetric();

  /** The number of positive (same-class) pairs desired for training */
  protected int m_numPosPairs = 500;

  /** The number of negative (different-class) pairs desired for training */
  protected int m_numNegPairs = 500;

  /** We may require objects to have a minimum number of
   * common tokens for them to be considered
   * for distance computation */
  protected int m_minCommonTokens = 0;

  /** A default constructor */
  public SumInstanceMetric() {
  }

  /**
   * Sets up the metric for the specified attributes: stores the attribute
   * indices and creates one clone of the baseline metric per attribute.
   *
   * @param attrIdxs indices of the attributes that the metric will work on;
   * the array is stored by reference, not copied
   * @exception Exception if the distance metric has not been
   * generated successfully.
   */
  public void buildInstanceMetric(int[] attrIdxs) throws Exception {
    // m_attrIdxs is not declared in this file (presumably inherited from InstanceMetric)
    m_attrIdxs = attrIdxs;

    // one independent copy of the baseline metric for every attribute
    m_stringMetrics = new StringMetric[m_attrIdxs.length];
    for (int i = 0; i < m_stringMetrics.length; i++) {
      m_stringMetrics[i] = (StringMetric) m_metric.clone();
    }
  }

  /**
   * Trains the per-attribute metrics created by buildInstanceMetric().
   * The selector is initialized with the training data; if the baseline metric
   * is a DataDependentStringMetric every per-attribute metric is built from the
   * strings of both data sets, and if it is a LearnableStringMetric every
   * per-attribute metric is trained on string pairs obtained from the selector.
   *
   * @param trainData instances that the selector draws training pairs from
   * @param testData instances whose strings are also handed to data-dependent metrics
   * @exception Exception if any of the underlying metrics fails
   */
  public void trainInstanceMetric(Instances trainData, Instances testData) throws Exception {
    m_selector.initSelector(trainData);

    // if we have data-dependent metrics (e.g. vector-space), build them with available data
    if (m_metric instanceof DataDependentStringMetric) {
      for (int i = 0; i < m_stringMetrics.length; i++) {
        ArrayList stringList = getStringList(trainData, testData, m_attrIdxs[i]);
        ((DataDependentStringMetric) m_stringMetrics[i]).buildMetric(stringList);
      }
    }

    // train all the learnable metrics
    if (m_metric instanceof LearnableStringMetric) {
      for (int i = 0; i < m_stringMetrics.length; i++) {
        ArrayList strPairList = m_selector.getStringPairList(trainData, m_attrIdxs[i],
                                                             m_numPosPairs, m_numNegPairs,
                                                             m_stringMetrics[i]);
        // the requested counts are recorded as the actual counts
        m_numActualPosPairs = m_numPosPairs;
        m_numActualNegPairs = m_numNegPairs;

        // begin: creating transductive pairs for metric learning
        // (never executes while NUM_TRANSDUCTIVE_PAIRS is 0)
        for (int j = 0; j < NUM_TRANSDUCTIVE_PAIRS; j++) {
          Random r = new Random(j);
          int idx1, idx2;
          idx1 = r.nextInt(testData.numInstances());
          do {
            idx2 = r.nextInt(testData.numInstances());
          } while (idx2 == idx1);
          StringPair pair = new StringPair(testData.instance(idx1).stringValue(m_attrIdxs[i]),
                                           testData.instance(idx2).stringValue(m_attrIdxs[i]),
                                           true, 1);
          strPairList.add(pair);
        }
        // end: creating transductive pairs for metric learning

        ((LearnableStringMetric) m_stringMetrics[i]).trainMetric(strPairList);
      }
    }
    System.out.println(getTimestamp() + " Created a SumInstanceMetric.");
  }

  /**
   * An internal method for creating a list of strings for a particular attribute
   * from two sets of instances: training and test data. Either set may be null,
   * in which case it contributes nothing.
   *
   * @param trainData training instances, or null
   * @param testData test instances, or null
   * @param attrIdx index of the attribute whose string values are collected
   * @return the training values followed by the test values
   */
  protected ArrayList getStringList(Instances trainData, Instances testData, int attrIdx) {
    ArrayList stringList = new ArrayList();
    addStringValues(stringList, trainData, attrIdx);
    addStringValues(stringList, testData, attrIdx);
    return stringList;
  }

  /** Appends the string value of one attribute of every instance in data; a null data set is skipped */
  private static void addStringValues(ArrayList stringList, Instances data, int attrIdx) {
    if (data == null) {
      return;
    }
    for (int i = 0; i < data.numInstances(); i++) {
      stringList.add(data.instance(i).stringValue(attrIdx));
    }
  }

  /**
   * Returns distance between two instances without using the weights:
   * the sum of the per-attribute string distances. If a minimum number of
   * common tokens is required and some attribute does not reach it,
   * Double.MAX_VALUE is returned.
   *
   * @param instance1 First instance.
   * @param instance2 Second instance.
   * @return the summed distance, or Double.MAX_VALUE if the pair was filtered out
   * @exception Exception if distance could not be estimated.
   */
  public double distance(Instance instance1, Instance instance2) throws Exception {
    double distance = 0;
    for (int i = 0; i < m_stringMetrics.length; i++) {
      String str1 = instance1.stringValue(m_attrIdxs[i]);
      String str2 = instance2.stringValue(m_attrIdxs[i]);

      // too few common tokens: the pair is out of consideration, so there is
      // no point in evaluating the remaining string metrics
      if (m_minCommonTokens > 0 && numCommonTokens(str1, str2) < m_minCommonTokens) {
        return Double.MAX_VALUE;
      }
      distance += m_stringMetrics[i].distance(str1, str2);
    }
    return distance;
  }

  /**
   * Returns similarity between two instances without using the weights:
   * the sum of the per-attribute string similarities. The common-token
   * threshold is not applied here.
   *
   * @param instance1 First instance.
   * @param instance2 Second instance.
   * @return the summed similarity
   * @exception Exception if similarity could not be estimated.
   */
  public double similarity(Instance instance1, Instance instance2) throws Exception {
    double similarity = 0;
    for (int i = 0; i < m_stringMetrics.length; i++) {
      String str1 = instance1.stringValue(m_attrIdxs[i]);
      String str2 = instance2.stringValue(m_attrIdxs[i]);
      similarity += m_stringMetrics[i].similarity(str1, str2);
    }
    return similarity;
  }

  /**
   * The computation of a metric can be either based on distance, or on similarity
   *
   * @return true if the underlying metric computes distance, false if similarity;
   * this implementation always returns true
   */
  public boolean isDistanceBased() {
    return true;
  }

  /**
   * Set the baseline metric. The per-attribute metrics are clones made by
   * buildInstanceMetric(), so metrics that were already built are not replaced.
   *
   * @param metric the string metric to be used as the baseline on each string attribute
   */
  public void setMetric (StringMetric metric) {
    m_metric = metric;
  }

  /**
   * Get the baseline metric
   *
   * @return the baseline metric for each attribute
   */
  public StringMetric getMetric () {
    return m_metric;
  }

  /** Set the pairwise selector for this metric
   * @param selector a new pairwise selector
   */
  public void setSelector(PairwiseSelector selector) {
    m_selector = selector;
  }

  /** Get the pairwise selector for this metric
   * @return the current pairwise selector
   */
  public PairwiseSelector getSelector() {
    return m_selector;
  }

  /** Set the number of same-class training pairs
   * @param numPosPairs the number of same-class training pairs to create for training
   */
  public void setNumPosPairs(int numPosPairs) {
    m_numPosPairs = numPosPairs;
  }

  /** Get the number of same-class training pairs
   * @return the number of same-class training pairs to create for training
   */
  public int getNumPosPairs() {
    return m_numPosPairs;
  }

  /** Set the number of different-class training pairs
   * @param numNegPairs the number of different-class training pairs to create for training
   */
  public void setNumNegPairs(int numNegPairs) {
    m_numNegPairs = numNegPairs;
  }

  /** Get the number of different-class training pairs
   * @return the number of different-class training pairs to create for training
   */
  public int getNumNegPairs() {
    return m_numNegPairs;
  }

  /** Set the minimum number of common tokens that is required from objects
   * to be considered for distance computation
   * @param minCommonTokens the minimum number of tokens in common that is required
   * from objects to be considered for distance computation
   */
  public void setMinCommonTokens(int minCommonTokens) {
    m_minCommonTokens = minCommonTokens;
  }

  /** Get the minimum number of common tokens that is required from objects
   * to be considered for distance computation
   * @return the minimum number of tokens in common that is required
   * from objects to be considered for distance computation
   */
  public int getMinCommonTokens() {
    return m_minCommonTokens;
  }

  /**
   * Gets a string containing the current time of day.
   *
   * @return the current time formatted as "HH:mm:ss:"
   */
  protected static String getTimestamp() {
    return (new SimpleDateFormat("HH:mm:ss:")).format(new Date());
  }

  /** A little helper to create a single String from an array of Strings
   * @param strings an array of strings
   * @return a single concatenated string; every element is followed by one space
   */
  public static String concatStringArray(String[] strings) {
    StringBuffer buffer = new StringBuffer();
    for (int i = 0; i < strings.length; i++) {
      buffer.append(strings[i]);
      buffer.append(" ");
    }
    return buffer.toString();
  }

  /**
   * Returns the number of distinct tokens that two strings have in common.
   * Tokens are separated by whitespace and a fixed set of punctuation
   * characters. A token is counted once no matter how often it is repeated in
   * either string, so the result does not depend on the order of the arguments.
   *
   * @param s1 the first string
   * @param s2 the second string
   * @return the number of distinct tokens occurring in both strings
   */
  public static int numCommonTokens(String s1, String s2) {
    String delimiters = " \t\n\r\f\'\"\\!@#$%^&*() ";

    HashSet unmatched = new HashSet();
    StringTokenizer tokenizer = new StringTokenizer(s1, delimiters);
    while (tokenizer.hasMoreTokens()) {
      unmatched.add(tokenizer.nextToken());
    }

    int numCommon = 0;
    tokenizer = new StringTokenizer(s2, delimiters);
    while (tokenizer.hasMoreTokens()) {
      // remove() succeeds only the first time a shared token is seen, so
      // repetitions in s2 are not counted again
      if (unmatched.remove(tokenizer.nextToken())) {
        numCommon++;
      }
    }
    return numCommon;
  }

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options; currently always empty
   **/
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * Only one option is currently parsed:<p>
   *
   * -M metric options <p>
   * StringMetric used; the value is split into the metric name followed by
   * the metric's own options <p>
   *
   * The -t, -p and -n entries produced by getOptions() are not read here.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   *
   **/
  public void setOptions(String[] options) throws Exception {
    System.err.println("TODO!  this method has not been implemented properly");

    String metricString = Utils.getOption('M', options);
    if (metricString.length() != 0) {
      String[] metricSpec = Utils.splitOptions(metricString);
      String metricName = metricSpec[0];
      // blank out the name so that only the metric's own options remain
      metricSpec[0] = "";
      System.out.println("Metric name: " + metricName + "\nMetric parameters: " + concatStringArray(metricSpec));
      setMetric(StringMetric.forName(metricName, metricSpec));
    }
  }

  /**
   * Gets the current settings of this SumInstanceMetric.
   *
   * @return an array of strings describing the settings, padded with empty
   * strings to a length of at least 60
   */
  public String [] getOptions() {
    // collected in a list so that long selector/metric option arrays cannot
    // run past the end of a fixed-size array
    ArrayList options = new ArrayList();

    if (m_minCommonTokens > 0) {
      options.add("-t");
      options.add("" + m_minCommonTokens);
    }

    if (m_selector instanceof OptionHandler) {
      String[] selectorOptions = ((OptionHandler) m_selector).getOptions();
      for (int i = 0; i < selectorOptions.length; i++) {
        options.add(selectorOptions[i]);
      }
    }

    options.add("-p");
    options.add("" + m_numPosPairs);
    options.add("-n");
    options.add("" + m_numNegPairs);

    options.add("-M");
    options.add(Utils.removeSubstring(m_metric.getClass().getName(), "weka.deduping.metrics."));
    if (m_metric instanceof OptionHandler) {
      String[] metricOptions = ((OptionHandler) m_metric).getOptions();
      for (int i = 0; i < metricOptions.length; i++) {
        options.add(metricOptions[i]);
      }
    }

    // keep the historical padding with empty strings
    while (options.size() < MIN_OPTIONS_LENGTH) {
      options.add("");
    }
    return (String[]) options.toArray(new String[options.size()]);
  }
}