/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * PairwiseSelector.java * Copyright (C) 2003 Mikhail Bilenko * */ package weka.deduping; import java.util.*; import java.io.Serializable; import weka.core.*; import weka.deduping.metrics.*; import weka.deduping.blocking.*; /** * PairwiseSelector class. Given a string metric and training data, * create a set of instance pairs that correspond to metric training data * * @author Mikhail Bilenko (mbilenko@cs.utexas.edu) * @version $Revision: 1.11 $ */ public class PairwiseSelector implements OptionHandler, Serializable { /** The set of instances used for training */ protected Instances m_instances = null; /** A hashmap where true object IDs are mapped to lists of strings of that object */ protected HashMap m_classInstanceMap = null; /** A list of classes, each element is the double value of the class attribute */ protected ArrayList m_classValueList = null; /** A list with all the positive examples as TrainingPair's */ protected ArrayList m_posPairList = null; /** A list with a sufficient pool of negative examples as TrainingPair's */ protected ArrayList m_negPairList = null; /** The number of possible same-class pairs */ protected int m_numPotentialPositives = 0; /** The number of possible different-class pairs */ protected int m_numPotentialNegatives = 0; /** Output debugging information */ protected boolean m_debug = false; /** The record pair selection method */ // positives public static final int POS_MODE_RANDOM_RECORDS = 1; public static final int POS_MODE_RANDOM_POSITIVES = 2; public static final int POS_MODE_STATIC_ACTIVE = 4; public static final Tag[] TAGS_POS_MODE = { new Tag(POS_MODE_RANDOM_RECORDS, "Random record pairs"), new Tag(POS_MODE_RANDOM_POSITIVES, "Random positive pairs"), new Tag(POS_MODE_STATIC_ACTIVE, "Static-active positive pairs"), }; protected int m_positivesMode = POS_MODE_RANDOM_POSITIVES; // should the rejected positives be spilled into the negatives set? protected boolean m_useRejectedPositives = true; // negatives public static final int NEG_MODE_RANDOM_RECORDS = 1; public static final int NEG_MODE_RANDOM_NEGATIVES = 2; public static final int NEG_MODE_IMPLICIT_NEGATIVES = 4; public static final Tag[] TAGS_NEG_MODE = { new Tag(NEG_MODE_RANDOM_RECORDS, "Random record pairs"), new Tag(NEG_MODE_RANDOM_NEGATIVES, "Random negative pairs"), new Tag(NEG_MODE_IMPLICIT_NEGATIVES, "Implicit negative pairs") }; protected int m_negativesMode = NEG_MODE_RANDOM_NEGATIVES; // should falsely selected implicit negatives be spilled into the positives set? protected boolean m_useFalseImplicitNegatives = true; /** String pair selection method */ public static final int STRING_PAIRS_RANDOM = 1; public static final int STRING_PAIRS_HARDEST = 2; public static final int STRING_PAIRS_EASIEST = 4; public static final Tag[] TAGS_STRING_PAIR_MODE = { new Tag(STRING_PAIRS_RANDOM, "Random string pairs"), new Tag(STRING_PAIRS_HARDEST, "Hardest string pairs"), new Tag(STRING_PAIRS_EASIEST, "Easiest string pairs") }; protected int m_posStringMode = STRING_PAIRS_RANDOM; protected int m_negStringMode = STRING_PAIRS_RANDOM; /** The maximum fraction of common tokens that instances can have to be included as implicit negatives */ protected double m_maxImplicitCommonTokenFraction = 0.2; /** We will need this reverse comparator class to traverse a TreeSet backwards */ public class ReverseComparator implements Comparator { public int compare(Object o1, Object o2) { Comparable c = (Comparable) o1; return -1 * c.compareTo(o2); } } /** A default constructor */ public PairwiseSelector() { } /** Initialize m_classInstanceMap and m_classValueList using a given set of instances * @param instances a set of instances from which pair examples will be selected */ public void initSelector(Instances instances) { m_instances = instances; m_classValueList = new ArrayList(); m_classInstanceMap = new HashMap(); m_numPotentialPositives = 0; m_numPotentialNegatives = 0; // go through all instances, hashing them into lists corresponding to each class Enumeration enum = instances.enumerateInstances(); while (enum.hasMoreElements()) { Instance instance = (Instance) enum.nextElement(); if (instance.classIsMissing()) { System.err.println("Instance " + instance + " has missing class!!!"); continue; } Double classValue = new Double(instance.classValue()); // if this class has been seen, add instance to the class's list if (m_classInstanceMap.containsKey(classValue)) { ArrayList classInstanceList = (ArrayList) m_classInstanceMap.get(classValue); classInstanceList.add(instance); } else { // create a new list of instances for a previously unseen class ArrayList classInstanceList = new ArrayList(); classInstanceList.add(instance); m_classInstanceMap.put(classValue, classInstanceList); m_classValueList.add(classValue); } } // get the number of potential positive pairs Iterator iterator = m_classInstanceMap.values().iterator(); while (iterator.hasNext()) { ArrayList classInstanceList = (ArrayList) iterator.next(); m_numPotentialPositives += classInstanceList.size() * (classInstanceList.size() - 1) / 2; } int numInstances = instances.numInstances(); m_numPotentialNegatives = numInstances * (numInstances - 1) / 2 - m_numPotentialPositives; createPosPairList(); createNegPairList(); System.out.println("m_numPotentialPositives=" + m_numPotentialPositives + "\tm_numPotentialNegatives=" + m_numPotentialNegatives); } /** Generate a training set of diffInstances. initSelector must have been called earlier * to initialize m_posPairList and m_negPairList. * @param attrIdxs indeces of fields that should be utilized * @param stringMetrics metrics that should be used on training pairs to generate diffInstances * @param numPosPairs the desired number of positive (same-class) diffInstance's * @param numNegPairs the desired number of negative (different-class) diffInstance's */ public Instances getInstances(int [] attrIdxs, StringMetric[][] stringMetrics, int numPosPairs, int numNegPairs) throws Exception { int numActualPositives = 0; int numActualNegatives = 0; HashMap checksumMap = new HashMap(); HashSet usedPairSet = new HashSet(); int numTrainingRecords = m_instances.numInstances(); double[] checksumCoeffs = new double[stringMetrics.length * stringMetrics[0].length]; if (m_posPairList == null || m_negPairList == null) { throw new Exception("Called PairwiseSelector.getInstances before initalization via initSelector!"); } /*** Create the Instances dataset ***/ // first, create all the numeric attributes FastVector attrInfoVector = new FastVector(); Random r = new Random(numPosPairs + numNegPairs); for (int i = 0; i < stringMetrics.length; i++) { for (int j = 0; j < stringMetrics[i].length; j++) { Attribute attr = new Attribute("" + i + "-" + j); attrInfoVector.addElement(attr); checksumCoeffs[i*stringMetrics[i].length + j] = r.nextDouble(); } } // create the class attribute FastVector classValues = new FastVector(); classValues.addElement("pos"); classValues.addElement("neg"); Attribute classAttr = new Attribute("class", classValues); attrInfoVector.addElement(classAttr); // create the dataset and set the class attribute Instances instances = new Instances("diffInstances", attrInfoVector, numPosPairs + numNegPairs); instances.setClass(classAttr); /*** Positives selection ***/ switch (m_positivesMode) { case POS_MODE_RANDOM_RECORDS: // just pick m_numPosPairs random record pairs int numMisfires = 0; for (int i = 0; i < numPosPairs && numMisfires < 1000; i++) { InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { instances.add(trainInstance); if (trainInstance.value(trainInstance.numValues()-1) == 0) { numActualPositives++; } else { numActualNegatives++; } } else { i--; numMisfires++; } } break; case POS_MODE_RANDOM_POSITIVES: // we are sampling from all same-class pairs in the training fold // randomize the indeces of positive examples and select the desired number numMisfires = 0; int [] posPairIdxs = randomSubset(m_numPotentialPositives, m_numPotentialPositives); for (int i = 0; i < posPairIdxs.length && numActualPositives < numPosPairs && numMisfires < 500; i++) { Instance posInstance = createInstance((InstancePair) m_posPairList.get(posPairIdxs[i]), attrIdxs, stringMetrics); if (posInstance != null && isUniqueInstance(posInstance, checksumMap, checksumCoeffs)) { instances.add(posInstance); numActualPositives++; } else { numMisfires++; } } break; case POS_MODE_STATIC_ACTIVE: Blocking blocker = new Blocking(); blocker.buildIndex(m_instances); InstancePair[] pairs = blocker.getMostSimilarPairs(numPosPairs*2); numMisfires = 0; for (int i = 0; (numActualPositives + numActualNegatives) < numPosPairs && i < pairs.length && pairs[i] != null && numMisfires < 500; i++) { Instance trainInstance = createInstance(pairs[i], attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { if (pairs[i].positive == true) { instances.add(trainInstance); numActualPositives++; } else { if (m_useRejectedPositives) { instances.add(trainInstance); numActualNegatives++; } } } else { numMisfires++; } } System.out.println("After static-active:\t" + numActualPositives + " positives and " + numActualNegatives + " negatives"); break; default: throw new Exception("Unknown positive selection mode: " + m_positivesMode); } /*** Negatives selection ***/ switch (m_negativesMode) { case NEG_MODE_RANDOM_RECORDS: // just pick m_numNegPairs random record pairs int numMisfires = 0; for (int i = 0; i < numNegPairs && numMisfires < 1000; i++) { InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { instances.add(trainInstance); if (trainInstance.value(trainInstance.numValues()-1) == 0) { numActualPositives++; } else { numActualNegatives++; } } else { i--; numMisfires++; } } break; case NEG_MODE_RANDOM_NEGATIVES: // we are sampling from all different-class pairs in the training fold // randomize the indeces of negative examples and select the desired number numMisfires = 0; int numUniqueNegatives = 0; int [] negPairIdxs = randomSubset(m_numPotentialNegatives, m_numPotentialNegatives); for (int i = 0; i < negPairIdxs.length && numUniqueNegatives < numNegPairs && numMisfires < 1000; i++) { Instance negInstance = createInstance((InstancePair) m_negPairList.get(negPairIdxs[i]), attrIdxs, stringMetrics); if (negInstance != null && isUniqueInstance(negInstance, checksumMap, checksumCoeffs)) { instances.add(negInstance); numActualNegatives++; numUniqueNegatives++; } else { numMisfires++; } } break; case NEG_MODE_IMPLICIT_NEGATIVES: numMisfires = 0; for (int i = 0; i < numNegPairs && numMisfires < 30000; i++) { InstancePair pair = createRandomTrainInstancePair(usedPairSet, checksumMap); Instance trainInstance = createInstance(pair, attrIdxs, stringMetrics); if (trainInstance != null && isUniqueInstance(trainInstance, checksumMap, checksumCoeffs)) { // calculate the fraction of common tokens StringBuffer s1 = new StringBuffer(); StringBuffer s2 = new StringBuffer(); for (int j = 0; j < pair.instance1.numAttributes(); j++) { s1.append(pair.instance1.stringValue(j)); s1.append(" "); s2.append(pair.instance2.stringValue(j)); s2.append(" "); } if (fractionCommonTokens(s1.toString(), s2.toString()) <= m_maxImplicitCommonTokenFraction) { // check if the negative is bogus if (trainInstance.value(trainInstance.numValues()-1) == 0) { System.out.print("False negative!\n\t" + pair.instance1 + "\n\t" + pair.instance2); if (m_useFalseImplicitNegatives) { numActualPositives++; } else { trainInstance.setValue(trainInstance.numValues()-1, 1); numActualNegatives++; } instances.add(trainInstance); } else { // true implicit negative numActualNegatives++; instances.add(trainInstance); } } else { // try an extra pair if this one didn't work out due to too many positive pairs numMisfires++; i--; } } else { // try an extra pair if this one was null or not unique numMisfires++; i--; } } break; } System.out.println(); System.out.println("POSITIVES: requested=" + numPosPairs + "\tpossible=" + m_numPotentialPositives + "\tactual=" + numActualPositives); System.out.println("NEGATIVES: requested=" + numNegPairs + "\tpossible=" + m_numPotentialNegatives + "\tactual=" + numActualNegatives); return instances; } protected InstancePair createRandomTrainInstancePair(HashSet usedPairSet, HashMap checksumMap) { int numTrainingRecords = m_instances.numInstances(); int idx1, idx2; Integer pairCode, pairCodeOrdered; int numTries = 0; int maxNumTries = 1000; InstancePair pair = null; Random r = new Random(usedPairSet.size() + checksumMap.size()); // select a random pair of instances that has not been // seen before or until we exhaust all possible pairs do { idx1 = r.nextInt(numTrainingRecords); idx2 = r.nextInt(numTrainingRecords); while (idx2 == idx1) { // prevent selecting the same instance twice idx2 = r.nextInt(numTrainingRecords); } pairCode = new Integer(idx1 * numTrainingRecords + idx2); pairCodeOrdered = new Integer(idx2 * numTrainingRecords + idx1); numTries++; } while ((usedPairSet.contains(pairCode) || usedPairSet.contains(pairCodeOrdered)) && numTries < maxNumTries); if (numTries < maxNumTries) { // create the training instance and add it usedPairSet.add(pairCode); usedPairSet.add(pairCodeOrdered); Instance instance1 = m_instances.instance(idx1); Instance instance2 = m_instances.instance(idx2); boolean positive = (instance1.classValue() == instance2.classValue());; pair = new InstancePair(instance1, instance2, positive, 0); } return pair; } /** * Create a nonsparse instance with features corresponding to the * metric values between used fields of the two given instances * @param instancePair a pair of instances that is used for creating the new diffInstance * @param attrIdxs indeces of fields that should be utilized * @param metrics the string metrics that are used to create the training instances * @return a newly created diffInstance, or null if all diff-values are 0 */ protected Instance createInstance (InstancePair pair, int[] attrIdxs, StringMetric[][] metrics ) throws Exception { int numAttributes = metrics.length * metrics[0].length + 1; int numNonNegativeValues = 0; int numValues = 0; double[] values = new double[numAttributes]; for (int i = 0; i < attrIdxs.length; i++) { String val1 = pair.instance1.stringValue(attrIdxs[i]); String val2 = pair.instance2.stringValue(attrIdxs[i]); for (int j = 0; j < metrics.length; j++) { if (metrics[j][i].isDistanceBased()) { values[numValues] = metrics[j][i].distance(val1, val2); } else { values[numValues] = metrics[j][i].similarity(val1, val2); } if (values[numValues] != 0) { numNonNegativeValues++; } numValues++; } } if (pair.positive) { values[numAttributes-1] = 0; } else { values[numAttributes-1] = 1; } // if there were non-zero attributes, return the instance, otherwise return null if (numNonNegativeValues > 0) { return new Instance(1.0, values); } else { return null; } } /** Check whether an instance is unique * @param instance instance to be checked * @param checksumMap a map where checksum values are mapped to lists of instances * @param sumCoeffs coefficients used for computing the checksum * @return true if the instance is unique, false otherwise */ protected boolean isUniqueInstance(Instance instance, HashMap checksumMap, double[] checksumCoeffs) { double checksum = 0; // compute the checksum and round off to overcome machine precision errors for (int i = 0; i < instance.numValues()-1; i++) { checksum += checksumCoeffs[i] * instance.value(i); } checksum = (float) checksum; // if this checksum was encountered before, get a list of instances // that have this checksum, and check if any of them are dupes of this one if (checksumMap.containsKey(new Double(checksum))) { ArrayList checksumList = (ArrayList) checksumMap.get(new Double(checksum)); boolean unique = true; for (int k = 0; k < checksumList.size() && unique; k++) { Instance nextDiffInstance = (Instance) checksumList.get(k); unique = false; for (int l = 0; l < nextDiffInstance.numValues()-1 && !unique; l++) { if (((float)nextDiffInstance.value(l)) != ((float)instance.value(l))) { unique = true; } } if (unique == false) { return false; } } checksumList.add(instance); return true; // no dupes were found among instances with the same checksum } else { // this checksum has not been encountered before ArrayList checksumList = new ArrayList(); checksumMap.put(new Double(checksum), checksumList); checksumList.add(instance); return true; } } /** * Provide an array of string pairs metric using given training instances * * @param metric the metric to train * @param instances data to train the metric on * @exception Exception if training has gone bad. * @return a list of StringPair's that is training data for a particular field */ public ArrayList getStringPairList(Instances instances, int attrIdx, int numPosPairs, int numNegPairs, StringMetric metric) throws Exception { System.out.println("Selecting strings out of " + instances.numInstances() + " instances, first is \n" + instances.instance(0)); ArrayList pairList = new ArrayList(); TreeSet posPairSet = null; TreeSet negPairSet = null; double [] posPairDistances = null; double [] negPairDistances = null; Iterator iterator = null; int numPossiblePosStrPairs = 0, numPossibleNegStrPairs = 0; int numActualPositives = 0, numActualNegatives = 0; // SELECT POSITIVE PAIRS switch (m_posStringMode) { case STRING_PAIRS_EASIEST: posPairSet = new TreeSet(new ReverseComparator()); posPairDistances = populatePosStrPairSet(metric, posPairSet, attrIdx); numPossiblePosStrPairs = posPairSet.size(); // select numPositives iterator = posPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialPositives && i < numPosPairs; i++) { StringPair posPair = (StringPair) iterator.next(); pairList.add(posPair); } break; case STRING_PAIRS_HARDEST: posPairSet = new TreeSet(); posPairDistances = populatePosStrPairSet(metric, posPairSet, attrIdx); numPossiblePosStrPairs = posPairSet.size(); // select numPositives examples iterator = posPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialPositives && i < numPosPairs; i++) { StringPair posPair = (StringPair) iterator.next(); pairList.add(posPair); } break; case STRING_PAIRS_RANDOM: // Get string pairs for a given attribute ArrayList strPairList = new ArrayList(); for (int i = 0; i < m_posPairList.size(); i++) { InstancePair pair = (InstancePair) m_posPairList.get(i); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); if (!str1.equals(str2) && haveCommonTokens(str1, str2)) { StringPair strPair = new StringPair(str1, str2, true, 0); strPairList.add(strPair); } else { System.out.println("Equal strings, or no common tokens - NOT adding: " + str1 + "\t" + str2); } } numPossiblePosStrPairs = strPairList.size(); // if we have fewer pairs available than requested, return all the ones that were created if (strPairList.size() <= numPosPairs) { System.out.println("INSUFFICIENT available POSITIVE examples, using all " + strPairList.size()); pairList = strPairList; } else { // if we have more than enough potential pairs, sample randomly with replacement int[] indexes = randomSubset(numPosPairs, strPairList.size()); System.out.println("SUFFICIENT available POSITIVE examples, randomly selected " + indexes.length + " of " + strPairList.size()); for (int i = 0; i < indexes.length; i++) { pairList.add(strPairList.get(indexes[i])); } } // for (int i =0 ; i < strPairList.size(); i++) { // StringPair pair = (StringPair) strPairList.get(i); // System.out.println(pair.str1 + "\t\t\t" + pair.str2); // } break; default: throw new Exception("Unknown method for selecting positive pairs: " + m_posStringMode); } numActualPositives = pairList.size(); // we don't need negative string pairs for AffineProbMetric if (!metric.getClass().getName().equals("weka.deduping.metrics.AffineProbMetric")) { // SELECT NEGATIVE PAIRS unless this is AffineProbMetric - it doesn't need negatives switch (m_negStringMode) { case STRING_PAIRS_EASIEST: // Create a map with *all* negatives negPairSet = new TreeSet(); negPairDistances = populateNegStrPairSet(metric, negPairSet, attrIdx); numPossibleNegStrPairs = negPairSet.size(); iterator = negPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialNegatives && i < numNegPairs; i++) { StringPair negPair = (StringPair) iterator.next(); pairList.add(negPair); System.out.println("EASY: " + negPair.value + "\n\t" + negPair.str1 + "\n\t" + negPair.str2); } break; case STRING_PAIRS_HARDEST: negPairSet = new TreeSet(new ReverseComparator()); negPairDistances = populateNegStrPairSet(metric, negPairSet, attrIdx); numPossibleNegStrPairs = negPairSet.size(); // We will hash each pair of classes that was used so that we don't end up with // too many pairs from the same combination of two classes HashSet usedComboSet = new HashSet(); iterator = negPairSet.iterator(); for (int i = 0; iterator.hasNext() && i < m_numPotentialNegatives && i < numNegPairs; i++) { StringPair negPair = (StringPair) iterator.next(); Double class1class2HashValue = new Double(negPair.class1 * 100000 + negPair.class2); if (!usedComboSet.contains(class1class2HashValue)) { // kludge - comment out for cora1 pairList.add(negPair); // System.out.println("HARD: " + negPair.value + "\n\t" + negPair.str1 + "\n\t" + negPair.str2); usedComboSet.add(class1class2HashValue); // add reverse combo (or allow two per class if commented out // usedComboSet.add(new Double(negPair.class2 * 1000 + negPair.class1)); <- reverse combo } } break; case STRING_PAIRS_RANDOM: // Get string pairs for a given attribute ArrayList strPairList = new ArrayList(); for (int i = 0; i < m_negPairList.size(); i++) { InstancePair pair = (InstancePair) m_negPairList.get(i); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); if (!str1.equals(str2)) { StringPair strPair = new StringPair(str1, str2, false, 0); strPairList.add(strPair); } } numPossibleNegStrPairs = strPairList.size(); // if we have fewer pairs available than requested, return all the ones that were created if (strPairList.size() <= numNegPairs) { System.out.println("INSUFFICIENT available NEGATIVE examples, using all " + strPairList.size()); pairList.addAll(strPairList); } else { // if we have enough potential pairs, randomly sample with replacement int[] indexes = randomSubset(numNegPairs, strPairList.size()); System.out.println("SUFFICIENT available NEGATIVE examples, randomly selected " + indexes.length + " of " + strPairList.size()); for (int i = 0; i < indexes.length; i++) { pairList.add(strPairList.get(indexes[i])); } } break; default: throw new Exception("Unknown method for selecting negative pairs: " + m_negStringMode); } } numActualNegatives = pairList.size() - numActualPositives; System.out.println(); System.out.println("**POSITIVES: requested=" + numPosPairs + "\tpossible=" + numPossiblePosStrPairs + "\tactual=" + numActualPositives); System.out.println("**NEGATIVES: requested=" + numNegPairs + "\tpossible=" + numPossibleNegStrPairs + "\tactual=" + numActualNegatives); return pairList; } /** Add a pair to a TreeSet so that there are no collisions, and no values are erased * @param set a set to which a new pair should be added * @param pair a new pair of strings that is to be added; value * fields holds the distance between the strings * @return the unique value of the distance (possibly perturbed) * with which the pair was added */ protected double addUniquePair(TreeSet set, StringPair pair) { Random random = new Random(); double epsilon = 0.0000001; int counter = 0; while (set.contains(pair)) { double perturbation; if (pair.value == 0) { perturbation = Double.MIN_VALUE * random.nextInt(m_numPotentialPositives); } else { perturbation = pair.value * epsilon * ((random.nextDouble() > 0.5) ? 1 : -1); } pair.value += perturbation; counter++; if (counter % 10 == 0) { // increase perturbations if "nearby" values have been exhausted epsilon *= 10; } } set.add(pair); return pair.value; } /** Populate a provided treeset with all positive StringPair's * @param metric a metric that will be used to calculate distance * @param pairSet an empty TreeSet that will be populated * @param attrIdx the index of the attribute for which positive * string pairs are being accumulated * @return an array with distance values of the created pairs */ protected double[] populatePosStrPairSet(StringMetric metric, TreeSet strPairSet, int attrIdx) throws Exception { double [] posPairDistances = new double[m_numPotentialPositives]; Arrays.fill(posPairDistances, Double.MIN_VALUE); int posCounter = 0; for (int i = 0; i < m_posPairList.size(); i++) { InstancePair pair = (InstancePair) m_posPairList.get(i); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); // unless the two fields are exact duplicates, create a new pair if (!str1.equals(str2)) { StringPair strPair = new StringPair(str1, str2, true, metric.similarity(str1, str2)); posPairDistances[posCounter++] = addUniquePair(strPairSet, strPair); } } return posPairDistances; } /** Populate a provided treeset with a sufficient population of negative StringPair's * @param metric a metric that will be used to calculate distance between strings * @param pairSet an empty TreeSet that will be populated * @param attrIdx the index of the attribute for which positive * string pairs are being accumulated * @return an array with distance values of the created pairs */ protected double[] populateNegStrPairSet(StringMetric metric, TreeSet strPairSet, int attrIdx) throws Exception { // Create a map with *all* positives double [] negPairDistances = new double[m_numPotentialNegatives]; Arrays.fill(negPairDistances, Double.MIN_VALUE); int negCounter = 0; int[] negPairIdxs; // get a random sample if we have too many possible negatives TODO - are we limiting ourselves here??? negPairIdxs = randomSubset(20000, m_numPotentialNegatives); for (int i = 0; i < negPairIdxs.length; i++) { InstancePair pair = (InstancePair) m_negPairList.get(negPairIdxs[i]); String str1 = pair.instance1.stringValue(attrIdx); String str2 = pair.instance2.stringValue(attrIdx); // unless the two fields are exact duplicates, create a new pair if (!str1.equals(str2)) { StringPair strPair = new StringPair(str1, str2, false, metric.similarity(str1, str2)); strPair.class1 = pair.instance1.classValue(); strPair.class2 = pair.instance2.classValue(); negPairDistances[negCounter++] = addUniquePair(strPairSet, strPair); } } return negPairDistances; } /** Populate m_posPairList with all positive InstancePair's */ protected void createPosPairList() { // go through lists of instances for each class and create a list of *all* positive pairs m_posPairList = new ArrayList(); Iterator iterator = m_classInstanceMap.values().iterator(); while (iterator.hasNext()) { ArrayList instanceList = (ArrayList) iterator.next(); // create all same-class pairs for every true object for (int i = 0; i < instanceList.size(); i++) { Instance instance1 = (Instance) instanceList.get(i); for (int j = i+1; j < instanceList.size(); j++) { Instance instance2 = (Instance) instanceList.get(j); InstancePair pair = new InstancePair(instance1, instance2, true, 0); m_posPairList.add(pair); } } } } /** Populate m_negPairList with negative InstancePair's */ protected void createNegPairList() { m_negPairList = new ArrayList(); // go through lists of instances for each class for (int i = 0; i < m_classValueList.size(); i++) { ArrayList instanceList1 = (ArrayList) m_classInstanceMap.get(m_classValueList.get(i)); for (int j = 0; j < instanceList1.size(); j++) { Instance instance1 = (Instance) instanceList1.get(j); // create all pairs from other clusters with this str for (int k = i+1; k < m_classValueList.size(); k++) { ArrayList instanceList2 = (ArrayList) m_classInstanceMap.get(m_classValueList.get(k)); for (int l = 0; l < instanceList2.size(); l++) { Instance instance2 = (Instance) instanceList2.get(l); InstancePair pair = new InstancePair(instance1, instance2, false, 0); m_negPairList.add(pair); } } } } } /** Given a set, return a TreeSet whose items are accessed in descending order * @param set any set containing Comparable objects * @return a new ordered set with those objects in reverse order */ public TreeSet reverseCopy(Set set) { TreeSet reverseSet = new TreeSet(new ReverseComparator()); reverseSet.addAll(set); return reverseSet; } /** Set the selection mode for positives * @param mode selection mode for positive examples */ public void setPositivesMode(SelectedTag mode) { if (mode.getTags() == TAGS_POS_MODE) { m_positivesMode = mode.getSelectedTag().getID(); } } /** * return the selection mode for positives * @return one of the selection modes */ public SelectedTag getPositivesMode() { return new SelectedTag(m_positivesMode, TAGS_POS_MODE); } /** Set the selection mode for negatives * @param mode selection mode for negative examples */ public void setNegativesMode(SelectedTag mode) { if (mode.getTags() == TAGS_NEG_MODE) { m_negativesMode = mode.getSelectedTag().getID(); } } /** * return the selection mode for negatives * @return one of the selection modes */ public SelectedTag getNegativesMode() { return new SelectedTag(m_negativesMode, TAGS_NEG_MODE); } /** * Set the maximum fraction of common tokens that instances can have to * be included as implicit negatives * @param maxImplicitCommonTokenFraction */ public void setMaxImplicitCommonTokenFraction(double maxImplicitCommonTokenFraction) { m_maxImplicitCommonTokenFraction = maxImplicitCommonTokenFraction; } /** * Get the maximum fraction of common tokens that instances can have to * be included as implicit negatives * @return the fraction */ public double getMaxImplicitCommonTokenFraction() { return m_maxImplicitCommonTokenFraction; } /** Turn using rejected positives as negatives on/off * @param useRejectedPositives if true, false positives that were picked during the * static-active selection will be added to the negatives set */ public void setUseRejectedPositives(boolean useRejectedPositives) { m_useRejectedPositives = useRejectedPositives; } /** Check whether using rejected positives as negatives is on or off * @return returns true if false positives that were picked during * the static-active selection are added to the negatives set */ public boolean getUseRejectedPositives() { return m_useRejectedPositives; } /** Turn using false implicit negatives on/off * @param useFalseImplicitNegatives if true, false implicit negatives will be added to positives */ public void setUseFalseImplicitNegatives(boolean useFalseImplicitNegatives) { m_useFalseImplicitNegatives = useFalseImplicitNegatives; } /** Check whether using false implicit negatives is on/off * @return true if false implicit negatives are added to positives */ public boolean getUseFalseImplicitNegatives() { return m_useFalseImplicitNegatives; } /** Set the selection mode for positive string examples * @param mode selection mode for positive string examples */ public void setPosStringMode(SelectedTag mode) { if (mode.getTags() == TAGS_STRING_PAIR_MODE) { m_posStringMode = mode.getSelectedTag().getID(); } } /** * return the selection mode for positive string examples * @return one of the selection modes for positive string examples */ public SelectedTag getPosStringMode() { return new SelectedTag(m_posStringMode, TAGS_STRING_PAIR_MODE); } /** Set the selection mode for negative string examples * @param mode selection mode for negative string examples */ public void setNegStringMode(SelectedTag mode) { if (mode.getTags() == TAGS_STRING_PAIR_MODE) { m_negStringMode = mode.getSelectedTag().getID(); } } /** * return the selection mode for negative string examples * @return one of the selection modes for negative string examples */ public SelectedTag getNegStringMode() { return new SelectedTag(m_negStringMode, TAGS_STRING_PAIR_MODE); } /** Turn debugging output on/off * @param debug if true, debugging info will be printed */ public void setDebug(boolean debug) { m_debug = debug; } /** See whether debugging output is on/off * @returns if true, debugging info will be printed */ public boolean getDebug() { return m_debug; } /** * get an array random indeces out of n possible values. * if the number of requested indeces is larger then maxIdx, returns * maxIdx permuted values * @param maxIdx - the maximum index of the set * @param numIdxs number of indexes to return * @return an array of indexes */ public static int[] randomSubset(int numIdxs, int maxIdx) { Random r = new Random(maxIdx + numIdxs); int[] indeces = new int[maxIdx]; for (int i = 0; i < maxIdx; i++) { indeces[i] = i; } // permute the indeces randomly for (int i = 0; i < indeces.length; i++) { int idx = r.nextInt (maxIdx); int temp = indeces[idx]; indeces[idx] = indeces[i]; indeces[i] = temp; } int []returnIdxs = new int[Math.min(numIdxs,maxIdx)]; for (int i = 0; i < returnIdxs.length; i++) { returnIdxs[i] = indeces[i]; } return returnIdxs; } /** return true if two strings have commmon tokens */ public static boolean haveCommonTokens(String s1, String s2) { String delimiters = " \t\n\r\f\'\"\\!@#$%^&*()_-+={}<>,.;:|[]{}/*~`"; HashSet tokenSet1 = new HashSet(); StringTokenizer tokenizer = new StringTokenizer(s1, delimiters); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); tokenSet1.add(token); } int count = 0; tokenizer = new StringTokenizer(s2, delimiters); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (tokenSet1.contains(token)) { count++; if (count > 0) { return true; } } } return false; } /** return the number of commmon tokens that two strings have * @param s1 string 1 * @param s2 string 2 * @return the number of common tokens the strings have */ public static double fractionCommonTokens(String s1, String s2) { String delimiters = " \t\n\r\f\'\"\\!@#$%^&*()_-+={}<>,.;:|[]{}/*~`"; HashSet tokenSet1 = new HashSet(); int commonTokens = 0; int totalTokens = 0; StringTokenizer tokenizer = new StringTokenizer(s1, delimiters); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); tokenSet1.add(token); totalTokens++; } tokenizer = new StringTokenizer(s2, delimiters); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (tokenSet1.contains(token)) { commonTokens++; } totalTokens++; } return ((commonTokens + 0.0)/totalTokens); } /** * Gets the current settings of WeightedDotP. * * @return an array of strings suitable for passing to setOptions() */ public String [] getOptions() { String [] options = new String [10]; int current = 0; switch(m_positivesMode) { case POS_MODE_RANDOM_RECORDS: options[current++] = "-Pr"; break; case POS_MODE_RANDOM_POSITIVES: options[current++] = "-Pp"; break; case POS_MODE_STATIC_ACTIVE: if (m_useRejectedPositives) { options[current++] = "-PsN"; } else { options[current++] = "-Ps"; } break; } switch(m_negativesMode) { case NEG_MODE_RANDOM_RECORDS: options[current++] = "-Nr"; break; case NEG_MODE_RANDOM_NEGATIVES: options[current++] = "-Nn"; break; case NEG_MODE_IMPLICIT_NEGATIVES: if (m_useFalseImplicitNegatives) { options[current++] = "-NiP" + m_maxImplicitCommonTokenFraction; } else { options[current++] = "-Ni" + m_maxImplicitCommonTokenFraction; } break; } switch(m_posStringMode) { case STRING_PAIRS_RANDOM: options[current++] = "-SPr"; break; case STRING_PAIRS_HARDEST: options[current++] = "-SPh"; break; case STRING_PAIRS_EASIEST: options[current++] = "-SPe"; break; } switch(m_negStringMode) { case STRING_PAIRS_RANDOM: options[current++] = "-SNr"; break; case STRING_PAIRS_HARDEST: options[current++] = "-SNh"; break; case STRING_PAIRS_EASIEST: options[current++] = "-SNe"; break; } while (current < options.length) { options[current++] = ""; } return options; } /** * Parses a given list of options. Valid options are:<p> * */ public void setOptions(String[] options) throws Exception { } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(0); return newVector.elements(); } }