/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    VectorSpaceMetric.java
 *    Copyright (C) 2001 Mikhail Bilenko, Raymond J. Mooney
 *
 */

package weka.deduping.metrics;

import java.util.*;
import java.io.Serializable;
import weka.core.*;

/**
 * This class uses a vector space model to calculate the similarity between
 * two strings.  Some code is borrowed from the ir.vsr package by
 * Raymond J. Mooney.
 *
 * @author Mikhail Bilenko
 */
public class VectorSpaceMetric extends StringMetric
  implements DataDependentStringMetric, OptionHandler, Serializable {

  /** Strings are mapped to StringReferences in this hash */
  protected HashMap m_stringRefHash = null;

  /** A HashMap where tokens are indexed.  Each indexed token maps
   * to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A list of all indexed strings.  Elements are StringReference's. */
  public ArrayList m_stringRefs = null;

  /** An underlying tokenizer that is used for converting strings
   * into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** Should IDF weighting be used? */
  protected boolean m_useIDF = true;

  /** We can have different ways of converting from similarity to distance */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "distance = 1-similarity"),
    new Tag(CONVERSION_LAPLACIAN, "distance = 1/(1+similarity)"),
    new Tag(CONVERSION_EXPONENTIAL, "distance = exp(-similarity)")
  };

  /** The similarity-to-distance conversion method, exponential by default */
  protected int m_conversionType = CONVERSION_EXPONENTIAL;

  /** Create a new metric with empty indices; the vector space itself
   * is built later via buildMetric(List). */
  public VectorSpaceMetric() {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();
  }

  /** Given a list of strings, build the vector space
   * @param strings a list of strings from which the inverted index is
   * to be constructed */
  public void buildMetric(List strings) throws Exception {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();

    // Loop, processing each of the examples
    Iterator stringIterator = strings.iterator();
    while (stringIterator.hasNext()) {
      String string = (String) stringIterator.next();

      // Create a document vector for this document
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexString(string, vector);
    }
    // Now that all strings have been processed, we can calculate the IDF
    // weights for all tokens and the resulting lengths of all weighted
    // document vectors.
    computeIDFandStringLengths();
    System.out.println("Indexed " + m_stringRefs.size() + " documents with "
                       + size() + " unique terms.");
  }
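  /* A minimal usage sketch (illustrative only; the strings below are
   * hypothetical): the vector space must be built over all strings of
   * interest before similarities or distances between them are queried.
   *
   *   VectorSpaceMetric metric = new VectorSpaceMetric();
   *   metric.buildMetric(Arrays.asList(new String[] {"john q smith", "jon smith"}));
   *   double sim  = metric.similarity("john q smith", "jon smith");
   *   double dist = metric.distance("john q smith", "jon smith");
   */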
  /** Index a given string using its corresponding vector */
  protected void indexString(String string, HashMapVector vector) {
    // Create a new reference
    StringReference strRef = new StringReference(string, vector);
    m_stringRefs.add(strRef);
    m_stringRefHash.put(string, strRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String) entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int) ((Weight) entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, strRef);
    }
  }

  /** Add a token occurrence to the index.
   * @param token The token to index.
   * @param count The number of times it occurs in the document.
   * @param strRef A reference to the String it occurs in.
   */
  protected void indexToken(String token, int count, StringReference strRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(strRef, count));
  }

  /** Compute the IDF factor for every token in the index and the length
   * of the string vector for every string referenced in the index. */
  protected void computeIDFandStringLengths() {
    // Let N be the total number of documents indexed
    double N = m_stringRefs.size();

    // Iterate through each of the tokens in the index
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      TokenInfo tokenInfo = (TokenInfo) entry.getValue();

      // Get the total number of strings in which this token occurs
      double numStringRefs = tokenInfo.occList.size();

      // Calculate the IDF factor for this token
      double idf = Math.log(N / numStringRefs);
      if (idf == 0.0) {
        // An IDF of 0 means the token occurs in every string and carries
        // no discriminating information, so remove it from the index
        mapEntries.remove();
      } else {
        tokenInfo.idf = idf;

        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenOccurrence occ = (TokenOccurrence) tokenInfo.occList.get(i);
          if (m_useIDF) {
            occ.m_stringRef.m_length += Math.pow(idf * occ.m_count, 2);
          } else {
            occ.m_stringRef.m_length += occ.m_count * occ.m_count;
          }
        }
      }
    }
    // At this point, every document length is the sum of the squares of
    // its token weights.  To obtain the final lengths, set the length of
    // every document reference to the square root of this sum.
    for (int i = 0; i < m_stringRefs.size(); i++) {
      StringReference stringRef = (StringReference) m_stringRefs.get(i);
      stringRef.m_length = Math.sqrt(stringRef.m_length);
    }
  }
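  /* Worked example of the weighting above (numbers are illustrative):
   * with N = 10 indexed strings, a token occurring in 5 of them gets
   * idf = log(10/5) ~= 0.693.  A string containing that token twice plus
   * one occurrence of a token with idf = 1.0 ends up with length
   * sqrt((0.693*2)^2 + (1.0*1)^2) ~= 1.71, which later normalizes the
   * dot product in similarity() into a cosine.
   */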
  /** Compute similarity between two strings
   * @param s1 first string
   * @param s2 second string
   * @return similarity between the two strings
   */
  public double similarity(String s1, String s2) {
    StringReference stringRef1 = (StringReference) m_stringRefHash.get(s1);
    StringReference stringRef2 = (StringReference) m_stringRefHash.get(s2);
    // Strings that were never indexed have no vectors to compare
    if (stringRef1 == null || stringRef2 == null) {
      return 0;
    }
    double length1 = stringRef1.m_length;
    double length2 = stringRef2.m_length;
    HashMapVector v1 = stringRef1.m_vector;
    HashMapVector v2 = stringRef2.m_vector;

    double similarity = 0;
    if (length1 == 0 || length2 == 0) {
      return 0;
    }
    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the count for each token in the query
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();

      if (v2.hashMap.containsKey(token)) {
        double count1 = ((Weight) entry.getValue()).getValue();
        double count2 = ((Weight) v2.hashMap.get(token)).getValue();
        TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
        // add this component unless it was killed (with idf=0)
        if (tokenInfo != null) {
          double increment = count1 * count2;
          if (m_useIDF) {
            increment *= tokenInfo.idf * tokenInfo.idf;
          }
          similarity += increment;
        }
      }
    }
    similarity /= length1 * length2;
    return similarity;
  }

  /** The computation of a metric can be either based on distance, or on similarity
   * @return false because the dot product fundamentally computes similarity
   */
  public boolean isDistanceBased() {
    return false;
  }

  /** Set the tokenizer to use
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /** Get the tokenizer that is used
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /** Turn IDF weighting on/off
   * @param useIDF if true, all token weights will be weighted by IDF
   */
  public void setUseIDF(boolean useIDF) {
    m_useIDF = useIDF;
  }

  /** Check whether IDF weighting is on or off
   * @return if true, all token weights are weighted by IDF
   */
  public boolean getUseIDF() {
    return m_useIDF;
  }

  /** Return the number of tokens indexed.
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /**
   * Returns the distance between two strings using the current conversion
   * type (CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, or CONVERSION_UNIT)
   * @param string1 first string
   * @param string2 second string
   * @return the distance between the two strings
   * @exception Exception if distance could not be estimated
   */
  public double distance(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN:
      return 1 / (1 + similarity(string1, string2));
    case CONVERSION_UNIT:
      return 1 - similarity(string1, string2);
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-similarity(string1, string2));
    default:
      throw new Exception("Unknown similarity-to-distance conversion method");
    }
  }
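  /* For example, a similarity of 0.5 maps to distance 1/(1+0.5) ~= 0.667
   * under CONVERSION_LAPLACIAN, 1 - 0.5 = 0.5 under CONVERSION_UNIT, and
   * exp(-0.5) ~= 0.607 under CONVERSION_EXPONENTIAL.
   */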
  /**
   * Set the type of similarity-to-distance conversion.  Values other
   * than CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   * will be ignored
   *
   * @param conversionType type of the similarity-to-distance conversion to use
   */
  public void setConversionType(SelectedTag conversionType) {
    if (conversionType.getTags() == TAGS_CONVERSION) {
      m_conversionType = conversionType.getSelectedTag().getID();
    }
  }

  /**
   * Return the type of similarity-to-distance conversion
   * @return one of CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   */
  public SelectedTag getConversionType() {
    return new SelectedTag(m_conversionType, TAGS_CONVERSION);
  }

  /** Create a copy of this metric
   * @return another VectorSpaceMetric with the same parameters as this
   * metric (the built index itself is not copied)
   */
  public Object clone() {
    VectorSpaceMetric metric = new VectorSpaceMetric();
    metric.setConversionType(new SelectedTag(m_conversionType, TAGS_CONVERSION));
    metric.setTokenizer(m_tokenizer);
    metric.setUseIDF(m_useIDF);
    return metric;
  }

  /**
   * Gets the current settings of VectorSpaceMetric.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[20];
    int current = 0;

    if (m_conversionType == CONVERSION_EXPONENTIAL) {
      options[current++] = "-E";
    } else if (m_conversionType == CONVERSION_UNIT) {
      options[current++] = "-U";
    }
    if (m_useIDF) {
      options[current++] = "-I";
    }
    options[current++] = "-T";
    options[current++] = Utils.removeSubstring(m_tokenizer.getClass().getName(),
                                               "weka.deduping.metrics.");
    if (m_tokenizer instanceof OptionHandler) {
      String[] tokenizerOptions = ((OptionHandler) m_tokenizer).getOptions();
      for (int i = 0; i < tokenizerOptions.length; i++) {
        options[current++] = tokenizerOptions[i];
      }
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Parses a given list of options.  Valid options are:<p>
   *
   * -E use exponential conversion from similarity to distance <br>
   * -U use unit conversion from similarity to distance
   *    (Laplacian conversion is used if neither -E nor -U is given) <br>
   * -I use IDF weighting <br>
   * -T tokenizer class name, followed by the tokenizer's own options
   */
  public void setOptions(String[] options) throws Exception {
    // A minimal sketch that mirrors getOptions(); it assumes the standard
    // weka.core.Utils option-parsing helpers.
    if (Utils.getFlag('E', options)) {
      m_conversionType = CONVERSION_EXPONENTIAL;
    } else if (Utils.getFlag('U', options)) {
      m_conversionType = CONVERSION_UNIT;
    } else {
      m_conversionType = CONVERSION_LAPLACIAN;
    }
    setUseIDF(Utils.getFlag('I', options));

    String tokenizerName = Utils.getOption('T', options);
    if (tokenizerName.length() != 0) {
      Tokenizer tokenizer = (Tokenizer)
        Class.forName("weka.deduping.metrics." + tokenizerName).newInstance();
      if (tokenizer instanceof OptionHandler) {
        ((OptionHandler) tokenizer).setOptions(options);
      }
      setTokenizer(tokenizer);
    }
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}