/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Blocking.java
 *    Copyright (C) 2003 Mikhail Bilenko
 *
 */

package weka.deduping.blocking;

import java.util.*;
import java.io.Serializable;
import java.text.SimpleDateFormat;

import weka.core.*;
import weka.deduping.*;
import weka.deduping.metrics.*;

/**
 * This class takes a set of records, amalgamates each of them into a single
 * string and creates an inverted index for that collection. It can then
 * return the pairs of strings that are most alike. Largely borrowed from
 * VectorSpaceMetric.
 *
 * @author Mikhail Bilenko
 */
public class Blocking implements OptionHandler, Serializable {

  /** The dataset that contains the instances */
  protected Instances m_instances = null;

  /** Instances are mapped to InstanceReferences in this hash */
  protected HashMap m_instanceRefHash = null;

  /** A HashMap where tokens are indexed.  Each indexed token maps
   *  to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A TreeSet where the InstancePairs are stored for subsequent retrieval */
  protected TreeSet m_pairSet = new TreeSet(new InstancePairComparator());

  class InstancePairComparator implements java.util.Comparator {
    public InstancePairComparator() {}

    public int compare(Object o1, Object o2) {
      // InstancePairs implement Comparable; negate the result so that the
      // TreeSet iterates in order of decreasing similarity
      int result = ((Comparable)o1).compareTo(o2);
      if (result != 0) {
        return -result;
      } else {
        // Ties are resolved in a very ad hoc way by comparing attribute
        // values of the paired instances...  TODO: a better way?
        InstancePair p1 = (InstancePair) o1;
        for (int i = 0; i < p1.instance1.numValues(); i++) {
          double v1 = p1.instance1.value(i);
          double v2 = p1.instance2.value(i);
          if (v1 != v2) {
            return ((v1 - v2) > 0) ? 1 : -1;
          }
        }
        InstancePair p2 = (InstancePair) o2;
        for (int i = 0; i < p2.instance1.numValues(); i++) {
          double v1 = p2.instance1.value(i);
          double v2 = p2.instance2.value(i);
          if (v1 != v2) {
            return ((v1 - v2) > 0) ? 1 : -1;
          }
        }
        for (int i = 0; i < p1.instance1.numValues(); i++) {
          double v1 = p1.instance1.value(i);
          double v2 = p2.instance1.value(i);
          if (v1 != v2) {
            return ((v1 - v2) > 0) ? 1 : -1;
          }
        }
        // Last resort: compare the first characters of the first string value
        // of each pair so that a nonzero result is always returned
        System.err.println("Blocking: could not break a tie between two instance pairs"
                           + " via attribute values");
        String s1 = p1.instance1.stringValue(0);
        String s2 = p2.instance1.stringValue(0);
        return (s1.charAt(0) > s2.charAt(0)) ? 1 : -1;
      }
    }
  }

  /** A list of all indexed instances.  Elements are InstanceReference's. */
  public ArrayList m_instanceRefs = null;

  /** An underlying tokenizer that is used for converting strings
   *  into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** Should IDF weighting be used? */
  protected boolean m_useIDF = true;

  /** Create a new blocking object with empty index structures */
  public Blocking() {
    m_instanceRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_instanceRefs = new ArrayList();
  }

  /** Given a set of instances, build the inverted index
   * @param instances instances from which the inverted index is to be constructed
   */
  public void buildIndex(Instances instances) throws Exception {
    m_instances = instances;
    m_instanceRefHash = new HashMap();
    m_tokenHash = new HashMap();
    int classIndex = instances.classIndex();

    for (int i = 0; i < instances.numInstances(); i++) {
      Instance instance = instances.instance(i);

      // Amalgamate all non-class attribute values into a single string
      StringBuffer buffer = new StringBuffer();
      for (int j = 0; j < instance.numAttributes(); j++) {
        if (j != classIndex) {
          buffer.append(instance.stringValue(j)).append(" ");
        }
      }

      // Create a document vector for this document
      String string = buffer.toString();
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexInstance(instance, i, string, vector);
    }

    // Now that all instances have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandStringLengths();
    System.out.println(getTimestamp() + " Indexed " + m_instanceRefs.size()
                       + " documents with " + size() + " unique terms.");

    createPairSet();
    System.out.println(getTimestamp() + " Created a set with " + m_pairSet.size() + " pairs");
  }

  /** Index a given Instance using its corresponding vector */
  protected void indexInstance(Instance instance, int idx, String string, HashMapVector vector) {
    // Create a new reference
    InstanceReference instRef = new InstanceReference(instance, idx, string, vector);
    m_instanceRefs.add(instRef);
    m_instanceRefHash.put(instance, instRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      // An entry in the HashMap maps a token to a Weight
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int)((Weight)entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, instRef);
    }
  }

  /** Add a token occurrence to the index.
   * @param token the token to index
   * @param count the number of times it occurs in the document
   * @param instRef a reference to the Instance it occurs in
   */
  protected void indexToken(String token, int count, InstanceReference instRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo)m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenInstanceOccurrence(instRef, count));
  }
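  /*
   * The structures built above form the inverted index: m_tokenHash maps every
   * token to a TokenInfo whose occList holds one TokenInstanceOccurrence
   * (instance reference plus raw count) per document containing that token.
   * createPairSet() below only compares instances that co-occur in at least one
   * of these occurrence lists, which is what restricts ("blocks") the candidate
   * pairs to those sharing a token.
   */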
  /** Compute the IDF factor for every token in the index and the length
   *  of the string vector for every string referenced in the index.
   */
  protected void computeIDFandStringLengths() {
    // Let N be the total number of documents indexed
    double N = m_instanceRefs.size();

    // Iterate through each of the tokens in the index
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      TokenInfo tokenInfo = (TokenInfo)entry.getValue();

      // Get the total number of strings in which this token occurs
      double numInstanceRefs = tokenInfo.occList.size();

      // Calculate the IDF factor for this token
      double idf = Math.log(N / numInstanceRefs);
      if (idf == 0.0) {
        // If IDF is 0 (the token occurs in every document), just remove this
        // inconsequential token from the index
        mapEntries.remove();
      } else {
        tokenInfo.idf = idf;

        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenInstanceOccurrence occ = (TokenInstanceOccurrence)tokenInfo.occList.get(i);
          if (m_useIDF) {
            occ.instanceRef.length = occ.instanceRef.length + Math.pow(idf * occ.count, 2);
          } else {
            occ.instanceRef.length = occ.instanceRef.length + occ.count * occ.count;
          }
        }
      }
    }

    // At this point, every document length is the sum of the squares of
    // its token weights.  To calculate the final lengths, set the length of
    // every document reference to the square root of this sum.
    for (int i = 0; i < m_instanceRefs.size(); i++) {
      InstanceReference instanceRef = (InstanceReference)m_instanceRefs.get(i);
      instanceRef.length = Math.sqrt(instanceRef.length);
    }
  }

  /** Populate m_pairSet with all the InstancePairs that share common tokens, so that
   *  they can be retrieved in order of decreasing similarity later */
  public void createPairSet() {
    HashSet processedPairSet = new HashSet();

    // Iterate through each of the tokens in the index, getting instances containing them
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      TokenInfo tokenInfo = (TokenInfo)entry.getValue();

      // Get the total number of strings in which this token occurs
      int numInstanceRefs = tokenInfo.occList.size();

      // If more than one, compare each pair and add it to the pair set
      if (numInstanceRefs > 1) {
        for (int i = 0; i < numInstanceRefs; i++) {
          InstanceReference instRef1 = ((TokenInstanceOccurrence) tokenInfo.occList.get(i)).instanceRef;
          for (int j = i + 1; j < numInstanceRefs; j++) {
            InstanceReference instRef2 = ((TokenInstanceOccurrence) tokenInfo.occList.get(j)).instanceRef;
            Integer hashValue1 = new Integer(instRef1.idx * m_instances.numInstances() + instRef2.idx);
            Integer hashValue2 = new Integer(instRef2.idx * m_instances.numInstances() + instRef1.idx);

            // If the similarity for this pair of instances has not been calculated
            // before, calculate it and store the pair
            if (!processedPairSet.contains(hashValue1)) {
              double sim = similarity(instRef1, instRef2);
              InstancePair pair = new InstancePair(instRef1.instance, instRef2.instance,
                                                   (instRef1.instance.classValue()
                                                    == instRef2.instance.classValue()),
                                                   sim);
              m_pairSet.add(pair);
              processedPairSet.add(hashValue1);
              processedPairSet.add(hashValue2);
            }
          }
        }
      }
    }
  }
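  /*
   * A note on the score used below: with IDF weighting on, similarity() computes the
   * TF-IDF cosine similarity between the two token-count vectors,
   *
   *     sim(d1, d2) = sum_t [ idf(t)^2 * count(d1, t) * count(d2, t) ] / (length1 * length2),
   *
   * where length_i = sqrt( sum_t (idf(t) * count(d_i, t))^2 ) is precomputed in
   * computeIDFandStringLengths() and idf(t) = ln(N / df(t)).  With IDF weighting off,
   * the idf factors are dropped from both the numerator and the lengths, giving plain
   * cosine similarity over raw token counts.
   */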
  /** Compute the similarity between the strings of two indexed instances
   * @param iRef1 reference to the first instance
   * @param iRef2 reference to the second instance
   * @return the similarity between the two underlying strings
   */
  public double similarity(InstanceReference iRef1, InstanceReference iRef2) {
    double length1 = iRef1.length;
    double length2 = iRef2.length;
    HashMapVector v1 = iRef1.vector;
    HashMapVector v2 = iRef2.vector;
    double similarity = 0;

    if (length1 == 0 || length2 == 0) {
      return 0;
    }

    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the count for each token in the first vector
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();

      if (v2.hashMap.containsKey(token)) {
        double count1 = ((Weight)entry.getValue()).getValue();
        double count2 = ((Weight)v2.hashMap.get(token)).getValue();
        TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);

        // Add this component unless the token was removed from the index (idf = 0)
        if (tokenInfo != null) {
          double increment = count1 * count2;
          if (m_useIDF) {
            increment *= tokenInfo.idf * tokenInfo.idf;
          }
          similarity += increment;
        }
      }
    }
    similarity /= length1 * length2;
    return similarity;
  }

  /** Return the most similar pairs
   * @param numPairs the maximum number of pairs to return
   * @return an array with the numPairs most similar pairs, in order of decreasing similarity
   */
  public InstancePair[] getMostSimilarPairs(int numPairs) {
    Iterator iterator = m_pairSet.iterator();
    int i = 0;
    InstancePair[] pairs = new InstancePair[numPairs];
    while (iterator.hasNext() && i < numPairs) {
      InstancePair pair = (InstancePair) iterator.next();
      pairs[i++] = pair;
      //      System.out.println(pair.value + "\t" + pair.positive);
    }
    return pairs;
  }

  /** Return the number of tokens indexed.
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /** Set the tokenizer to use
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /** Get the tokenizer that is used
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /** Turn IDF weighting on/off
   * @param useIDF if true, all token weights will be weighted by IDF
   */
  public void setUseIDF(boolean useIDF) {
    m_useIDF = useIDF;
  }

  /** Check whether IDF weighting is on/off
   * @return if true, all token weights are weighted by IDF
   */
  public boolean getUseIDF() {
    return m_useIDF;
  }

  /**
   * Gets the current settings of Blocking
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[1];
    int current = 0;
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Gets a string containing the current date and time.
   *
   * @return a string containing the date and time
   */
  protected static String getTimestamp() {
    return (new SimpleDateFormat("HH:mm:ss:")).format(new Date());
  }

  /**
   * Parses a given list of options.  Currently no options are supported.
   */
  public void setOptions(String[] options) throws Exception {
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}
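/*
 * Example usage: a minimal sketch, not part of the original class.  The wrapper
 * class name and the file name "records.arff" are only illustrative; it is assumed
 * that the records have string (or nominal) attributes and a class attribute that
 * identifies duplicate clusters.
 *
 *   package weka.deduping.blocking;
 *
 *   import java.io.BufferedReader;
 *   import java.io.FileReader;
 *   import weka.core.*;
 *   import weka.deduping.*;
 *
 *   public class BlockingExample {
 *     public static void main(String[] args) throws Exception {
 *       // Load the records and mark the last attribute as the (duplicate-cluster) class
 *       Instances data = new Instances(new BufferedReader(new FileReader("records.arff")));
 *       data.setClassIndex(data.numAttributes() - 1);
 *
 *       // Build the inverted index and the similarity-ordered pair set
 *       Blocking blocking = new Blocking();
 *       blocking.setUseIDF(true);
 *       blocking.buildIndex(data);
 *
 *       // Retrieve the 100 candidate pairs with the highest similarity; the array
 *       // may have trailing nulls if fewer pairs were generated
 *       InstancePair[] candidates = blocking.getMostSimilarPairs(100);
 *       for (int i = 0; i < candidates.length && candidates[i] != null; i++) {
 *         System.out.println(candidates[i].value + "\t" + candidates[i].positive);
 *       }
 *     }
 *   }
 */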