/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    VectorSpaceMetric.java
 *    Copyright (C) 2001 Mikhail Bilenko, Raymond J. Mooney
 *
 */

package weka.deduping.metrics;

import java.util.*;
import java.io.Serializable;
import weka.core.*;

/**
 * This class uses a vector space model to calculate the similarity between
 * two strings.  Some code is borrowed from the ir.vsr package by
 * Raymond J. Mooney.
 *
 * @author Mikhail Bilenko
 */
public class VectorSpaceMetric extends StringMetric
  implements DataDependentStringMetric, OptionHandler, Serializable {

  /** Strings are mapped to StringReferences in this hash */
  protected HashMap m_stringRefHash = null;

  /** A HashMap where tokens are indexed.  Each indexed token maps
   * to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A list of all indexed strings.  Elements are StringReference's. */
  public ArrayList m_stringRefs = null;

  /** An underlying tokenizer that is used for converting strings
   * into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** Should IDF weighting be used? */
  protected boolean m_useIDF = true;

  /** We can have different ways of converting from similarity to distance */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "distance = 1-similarity"),
    new Tag(CONVERSION_LAPLACIAN, "distance = 1/(1+similarity)"),
    new Tag(CONVERSION_EXPONENTIAL, "distance = exp(-similarity)")
  };

  /** The similarity-to-distance conversion method, exponential by default */
  protected int m_conversionType = CONVERSION_EXPONENTIAL;

  /** Create a new metric with empty indices; the vector space itself
   * is built later via buildMetric(List). */
  public VectorSpaceMetric() {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();
  }

  /** Given a list of strings, build the vector space
   * @param strings a list of strings from which the inverted index is
   * to be constructed */
  public void buildMetric(List strings) throws Exception {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();

    // Loop, processing each of the examples
    Iterator stringIterator = strings.iterator();
    while (stringIterator.hasNext()) {
      String string = (String) stringIterator.next();

      // Create a document vector for this document
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexString(string, vector);
    }
    // Now that all strings have been processed, we can calculate the IDF
    // weights for all tokens and the resulting lengths of all weighted
    // document vectors.
    computeIDFandStringLengths();
    System.out.println("Indexed " + m_stringRefs.size() + " documents with "
                       + size() + " unique terms.");
  }
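  /* A minimal usage sketch (illustrative only; the strings below are
   * hypothetical): the vector space must be built over all strings of
   * interest before similarities or distances between them are queried.
   *
   *   VectorSpaceMetric metric = new VectorSpaceMetric();
   *   metric.buildMetric(Arrays.asList(new String[] {"john q smith", "jon smith"}));
   *   double sim  = metric.similarity("john q smith", "jon smith");
   *   double dist = metric.distance("john q smith", "jon smith");
   */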
  /** Index a given string using its corresponding vector */
  protected void indexString(String string, HashMapVector vector) {
    // Create a new reference
    StringReference strRef = new StringReference(string, vector);
    m_stringRefs.add(strRef);
    m_stringRefHash.put(string, strRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String) entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int) ((Weight) entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, strRef);
    }
  }

  /** Add a token occurrence to the index.
   * @param token The token to index.
   * @param count The number of times it occurs in the document.
   * @param strRef A reference to the String it occurs in.
   */
  protected void indexToken(String token, int count, StringReference strRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(strRef, count));
  }

  /** Compute the IDF factor for every token in the index and the length
   * of the string vector for every string referenced in the index. */
  protected void computeIDFandStringLengths() {
    // Let N be the total number of documents indexed
    double N = m_stringRefs.size();

    // Iterate through each of the tokens in the index
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      TokenInfo tokenInfo = (TokenInfo) entry.getValue();

      // Get the total number of strings in which this token occurs
      double numStringRefs = tokenInfo.occList.size();

      // Calculate the IDF factor for this token
      double idf = Math.log(N / numStringRefs);
      if (idf == 0.0) {
        // An IDF of 0 means the token occurs in every string and carries
        // no discriminating information, so remove it from the index
        mapEntries.remove();
      } else {
        tokenInfo.idf = idf;

        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenOccurrence occ = (TokenOccurrence) tokenInfo.occList.get(i);
          if (m_useIDF) {
            occ.m_stringRef.m_length += Math.pow(idf * occ.m_count, 2);
          } else {
            occ.m_stringRef.m_length += occ.m_count * occ.m_count;
          }
        }
      }
    }
    // At this point, every document length is the sum of the squares of
    // its token weights.  To obtain the final lengths, set the length of
    // every document reference to the square root of this sum.
    for (int i = 0; i < m_stringRefs.size(); i++) {
      StringReference stringRef = (StringReference) m_stringRefs.get(i);
      stringRef.m_length = Math.sqrt(stringRef.m_length);
    }
  }
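  /* Worked example of the weighting above (numbers are illustrative):
   * with N = 10 indexed strings, a token occurring in 5 of them gets
   * idf = log(10/5) ~= 0.693.  A string containing that token twice plus
   * one occurrence of a token with idf = 1.0 ends up with length
   * sqrt((0.693*2)^2 + (1.0*1)^2) ~= 1.71, which later normalizes the
   * dot product in similarity() into a cosine.
   */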
  /** Compute similarity between two strings
   * @param s1 first string
   * @param s2 second string
   * @return similarity between the two strings
   */
  public double similarity(String s1, String s2) {
    StringReference stringRef1 = (StringReference) m_stringRefHash.get(s1);
    StringReference stringRef2 = (StringReference) m_stringRefHash.get(s2);
    // Strings that were never indexed have no vectors to compare
    if (stringRef1 == null || stringRef2 == null) {
      return 0;
    }
    double length1 = stringRef1.m_length;
    double length2 = stringRef2.m_length;
    HashMapVector v1 = stringRef1.m_vector;
    HashMapVector v2 = stringRef2.m_vector;

    double similarity = 0;
    if (length1 == 0 || length2 == 0) {
      return 0;
    }
    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the count for each token in the query
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();

      if (v2.hashMap.containsKey(token)) {
        double count1 = ((Weight) entry.getValue()).getValue();
        double count2 = ((Weight) v2.hashMap.get(token)).getValue();
        TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
        // add this component unless it was killed (with idf=0)
        if (tokenInfo != null) {
          double increment = count1 * count2;
          if (m_useIDF) {
            increment *= tokenInfo.idf * tokenInfo.idf;
          }
          similarity += increment;
        }
      }
    }
    similarity /= length1 * length2;
    return similarity;
  }

  /** The computation of a metric can be either based on distance, or on similarity
   * @return false because the dot product fundamentally computes similarity
   */
  public boolean isDistanceBased() {
    return false;
  }

  /** Set the tokenizer to use
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /** Get the tokenizer that is used
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /** Turn IDF weighting on/off
   * @param useIDF if true, all token weights will be weighted by IDF
   */
  public void setUseIDF(boolean useIDF) {
    m_useIDF = useIDF;
  }

  /** Check whether IDF weighting is on or off
   * @return if true, all token weights are weighted by IDF
   */
  public boolean getUseIDF() {
    return m_useIDF;
  }

  /** Return the number of tokens indexed.
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /**
   * Returns the distance between two strings using the current conversion
   * type (CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, or CONVERSION_UNIT)
   * @param string1 first string
   * @param string2 second string
   * @return the distance between the two strings
   * @exception Exception if distance could not be estimated
   */
  public double distance(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN:
      return 1 / (1 + similarity(string1, string2));
    case CONVERSION_UNIT:
      return 1 - similarity(string1, string2);
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-similarity(string1, string2));
    default:
      throw new Exception("Unknown similarity-to-distance conversion method");
    }
  }
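  /* For example, a similarity of 0.5 maps to distance 1/(1+0.5) ~= 0.667
   * under CONVERSION_LAPLACIAN, 1 - 0.5 = 0.5 under CONVERSION_UNIT, and
   * exp(-0.5) ~= 0.607 under CONVERSION_EXPONENTIAL.
   */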
  /**
   * Set the type of similarity-to-distance conversion.  Values other
   * than CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   * will be ignored
   *
   * @param conversionType type of the similarity-to-distance conversion to use
   */
  public void setConversionType(SelectedTag conversionType) {
    if (conversionType.getTags() == TAGS_CONVERSION) {
      m_conversionType = conversionType.getSelectedTag().getID();
    }
  }

  /**
   * Return the type of similarity-to-distance conversion
   * @return one of CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   */
  public SelectedTag getConversionType() {
    return new SelectedTag(m_conversionType, TAGS_CONVERSION);
  }

  /** Create a copy of this metric
   * @return another VectorSpaceMetric with the same parameters as this
   * metric (the built index itself is not copied)
   */
  public Object clone() {
    VectorSpaceMetric metric = new VectorSpaceMetric();
    metric.setConversionType(new SelectedTag(m_conversionType, TAGS_CONVERSION));
    metric.setTokenizer(m_tokenizer);
    metric.setUseIDF(m_useIDF);
    return metric;
  }

  /**
   * Gets the current settings of VectorSpaceMetric.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[20];
    int current = 0;

    if (m_conversionType == CONVERSION_EXPONENTIAL) {
      options[current++] = "-E";
    } else if (m_conversionType == CONVERSION_UNIT) {
      options[current++] = "-U";
    }
    if (m_useIDF) {
      options[current++] = "-I";
    }
    options[current++] = "-T";
    options[current++] = Utils.removeSubstring(m_tokenizer.getClass().getName(),
                                               "weka.deduping.metrics.");
    if (m_tokenizer instanceof OptionHandler) {
      String[] tokenizerOptions = ((OptionHandler) m_tokenizer).getOptions();
      for (int i = 0; i < tokenizerOptions.length; i++) {
        options[current++] = tokenizerOptions[i];
      }
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Parses a given list of options.  Valid options are:<p>
   *
   * -E use exponential conversion from similarity to distance <br>
   * -U use unit conversion from similarity to distance
   *    (Laplacian conversion is used if neither -E nor -U is given) <br>
   * -I use IDF weighting <br>
   * -T tokenizer class name, followed by the tokenizer's own options
   */
  public void setOptions(String[] options) throws Exception {
    // A minimal sketch that mirrors getOptions(); it assumes the standard
    // weka.core.Utils option-parsing helpers.
    if (Utils.getFlag('E', options)) {
      m_conversionType = CONVERSION_EXPONENTIAL;
    } else if (Utils.getFlag('U', options)) {
      m_conversionType = CONVERSION_UNIT;
    } else {
      m_conversionType = CONVERSION_LAPLACIAN;
    }
    setUseIDF(Utils.getFlag('I', options));

    String tokenizerName = Utils.getOption('T', options);
    if (tokenizerName.length() != 0) {
      Tokenizer tokenizer = (Tokenizer)
        Class.forName("weka.deduping.metrics." + tokenizerName).newInstance();
      if (tokenizer instanceof OptionHandler) {
        ((OptionHandler) tokenizer).setOptions(options);
      }
      setTokenizer(tokenizer);
    }
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}