/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * InstancePair.java * Copyright (C) 2002 Sugato Basu * */ package weka.clusterers; import java.util.*; import weka.core.Instance; import weka.core.Instances; /** Class for handling a pair of instances, in terms of indices of instances in an Instances set */ public class InstancePair implements Comparable { /** first instance index */ public int first; /** second instance index, always <= first */ public int second; /** MUST_LINK, CANNOT_LINK or DONT_CARE_LINK */ public int linkType; /** cost of violating constraint */ public double cost; /** score from active learning algorithm */ /** ----- DEPRECATED: ACTIVE SCORE NO LONGER USED IN PCKMEANS!!!! -----*/ public double activeScore; /** must-link */ public final static int MUST_LINK = 29; /** cannot-link */ public final static int CANNOT_LINK = 31; /** don't care */ public final static int DONT_CARE_LINK = 37; public static boolean m_isClassAttributeString = false; /** constructor */ public InstancePair() { } /** constructor */ public InstancePair(int a, int b) { first = a; second = b; } /** constructor */ public InstancePair(int a, int b, int l) { first = a; second = b; linkType = l; } /** constructor */ public InstancePair(int a, int b, int l, double c) { first = a; second = b; linkType = l; cost = c; } /** Compare function * @return 0 if equal, -1 if this.activeScore > a.activeScore, +1 else * Note: Reverse of conventional compareTo, to force sort in descending order */ public int compareTo (Object a) { if (a instanceof InstancePair) { return compareTo((InstancePair)a); } return 0; } /** Compare function * @return 0 if equal, -1 if this.activeScore > a.activeScore, +1 else * Note: Reverse of conventional compareTo, to force sort in descending order */ public int compareTo (InstancePair a) { if (this.activeScore == a.activeScore) return 0; else if (this.activeScore > a.activeScore) return -1; return +1; } /** Equals function * @return true if same, false else */ public boolean equals (Object a) { if (a instanceof InstancePair) { InstancePair b = (InstancePair) a; if (this.first==b.first && this.second==b.second && this.linkType==b.linkType) { return true; } else { return false; } } return super.equals(a); } /** hashCode */ public int hashCode() { return first*second*linkType; } /** Finds whether index is in pair */ boolean contains (int num) { return (first == num || second == num); } /** Returns an arraylist of random (both positive and negative) pair objects created from the input * @param instances list of instances * @param size number of pairs to return * @return arraylist of pairs */ public static ArrayList getPairs(Instances instances, int size) { return getPairs(instances, size, -1); } /** Returns an arraylist of pair objects created from the input set of instances * @param instances list of instances * @param size number of pairs to return * @param fractionMustLinks proportion of Must-Links; if -1 - sample randomly * @return arraylist of pairs */ public static ArrayList getPairs(Instances instances, int size, double fractionMustLinks) { ArrayList pairs = new ArrayList(size); int num=0; Random rand = new Random(42); m_isClassAttributeString = instances.instance(0).classAttribute().isString(); if (fractionMustLinks != -1) { int numMustLinks = (int) (fractionMustLinks * size); int numCannotLinks = size - numMustLinks; int numClasses = instances.numClasses(); // stratify instances into lists for each class HashMap classListMap = new HashMap(); for (int i = 0; i < instances.numInstances(); i++) { Double classValue = new Double(instances.instance(i).classValue()); if (classListMap.containsKey(classValue)) { ArrayList classList = (ArrayList) classListMap.get(classValue); classList.add(new Integer(i)); } else { // previously unseen class ArrayList classList = new ArrayList(); classList.add(new Integer(i)); classListMap.put(classValue, classList); } } // select must-links first while (num < numMustLinks) { int first = rand.nextInt(instances.numInstances()); int second = 0; if (!m_isClassAttributeString) { Double classValue = new Double(instances.instance(first).classValue()); ArrayList classList = (ArrayList) classListMap.get(classValue); // skip classes with a single instance if (classList.size() < 2) { continue; } // select a random instance from the same class int idx = rand.nextInt(classList.size()); second = ((Integer) classList.get(idx)).intValue(); } else { // phylo profile case second = rand.nextInt(instances.numInstances()); while (second == first) { second = rand.nextInt(instances.numInstances()); } } if (first > second) { // flip if out of order int i = first; first = second; second = i; } Instance firstInstance = instances.instance(first); Instance secondInstance = instances.instance(second); if (m_isClassAttributeString) { // for handling string valued class attributes corr. to // multi-class phylogenetic profiles double jaccardSim = jaccardSimilarityOfClassStrings(firstInstance, secondInstance); int linkType = InstancePair.DONT_CARE_LINK; double cost = 0; if (jaccardSim > 0) { linkType = InstancePair.MUST_LINK; cost = jaccardSim; } else if (jaccardSim == 0) { linkType = InstancePair.CANNOT_LINK; cost = 1.0; } else { // jaccardSim < 0 => don't care link linkType = InstancePair.DONT_CARE_LINK; cost = -1.0; } InstancePair pair = new InstancePair(first, second, linkType, cost); if (first!=second && !pairs.contains(pair) && linkType == InstancePair.MUST_LINK && cost < 1.0) { // to filter homologs pairs.add(pair); // System.out.println("Instances are:\n" + firstInstance + "\n" + secondInstance); // System.out.println("Jaccard sim = " + cost); // System.out.println(num + "th pair is: " + pair); num++; } } else { int linkType = (instances.instance(first).classValue() == instances.instance(second).classValue())? InstancePair.MUST_LINK:InstancePair.CANNOT_LINK; InstancePair pair = new InstancePair(first, second, linkType); if (first != second && !pairs.contains(pair) && linkType == InstancePair.MUST_LINK) { pairs.add(pair); num++; } } } // now add cannot-links - NB: for now not dealing with string attributes; TODO: handle m_isClassAttributeString num = 0; while (num < numCannotLinks) { // we just sample randomly - arguably less time-efficient, but we don't need to // create another hash this way. int first = rand.nextInt(instances.numInstances()); int second = rand.nextInt(instances.numInstances()); while (instances.instance(first).classValue() == instances.instance(second).classValue()) { second = rand.nextInt(instances.numInstances()); } if (first > second) { // flip if out of order int i = first; first = second; second = i; } InstancePair pair = new InstancePair(first, second, InstancePair.CANNOT_LINK); if (!pairs.contains(pair)) { pairs.add(pair); num++; } } System.out.println("Created " + numMustLinks + " must-links and " + numCannotLinks + " cannot-links."); } else { // just collect the requested number of instance pairs by sampling randomly while (num < size) { int i = rand.nextInt(instances.numInstances()); int j = rand.nextInt(instances.numInstances()); int first = (i<j)? i:j; int second = (i>=j)? i:j; Instance firstInstance = instances.instance(first); Instance secondInstance = instances.instance(second); if (firstInstance.classAttribute().isString()) { // for handling string valued class attributes corr. to // multi-class phylogenetic profiles double jaccardSim = jaccardSimilarityOfClassStrings(firstInstance, secondInstance); int linkType = InstancePair.DONT_CARE_LINK; double cost = 0; if (jaccardSim > 0) { linkType = InstancePair.MUST_LINK; cost = jaccardSim; } else if (jaccardSim == 0) { linkType = InstancePair.CANNOT_LINK; cost = 1.0; } else { // jaccardSim < 0 => don't care link linkType = InstancePair.DONT_CARE_LINK; cost = -1.0; } InstancePair pair = new InstancePair(first, second, linkType, cost); if (first!=second && !pairs.contains(pair) && linkType != InstancePair.DONT_CARE_LINK) { pairs.add(pair); // System.out.println(num + "th pair is: " + pair); num++; } } else { int linkType = (instances.instance(first).classValue() == instances.instance(second).classValue())? InstancePair.MUST_LINK:InstancePair.CANNOT_LINK; InstancePair pair = new InstancePair(first, second, linkType); if (first!=second && !pairs.contains(pair)) { pairs.add(pair); // System.out.println(num + "th pair is: " + pair); num++; } } } } return pairs; } public static double jaccardSimilarityOfClassStrings(Instance a, Instance b) { String s1 = a.classAttribute().value((int) a.classValue()); String s2 = b.classAttribute().value((int) b.classValue()); // System.out.println("Trying out " + s1 + " and " + s2); int numTokens1 = 0, numTokens2 = 0, numCommonTokens = 0; HashSet set1 = new HashSet(); StringTokenizer tokenizer = new StringTokenizer(s1, "_"); while (tokenizer.hasMoreTokens()) { set1.add(tokenizer.nextToken()); numTokens1++; } tokenizer = new StringTokenizer(s2, "_"); while (tokenizer.hasMoreTokens()) { if (set1.contains(tokenizer.nextToken())) { numCommonTokens++; } numTokens2++; } double jaccSim = 0; if (numTokens1 + numTokens2 > 0) { jaccSim = (numCommonTokens + 0.0) / (numTokens1 + numTokens2 - numCommonTokens); } if (numTokens1 == 0 || numTokens2 == 0) { jaccSim = -1; // to indicate DONT_CARE_LINK } // System.out.println("Instances are:\n" + a + "\n" + b); // System.out.println("Jaccard sim of " + s1 + " and " + s2 + " = " + jaccSim); return jaccSim; } /** returns string representation of InstancePair */ public String toString() { String string = new String(); string = "[" + first + "," + second + ","; if (linkType == MUST_LINK) { string = string + "MUST,"; } else if (linkType == CANNOT_LINK) { string = string + "CANNOT,"; } else if (linkType == DONT_CARE_LINK) { string = string + "DONTCARE,"; } string += cost + "]"; return string; } }