/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

/*
 * JaccardMetric.java
 * Copyright (C) 2001 Mikhail Bilenko, Raymond J. Mooney
 */

package weka.deduping.metrics;

import java.util.*;
import java.io.Serializable;

import weka.core.*;

/**
 * This class calculates similarity between two strings using the Jaccard metric:
 * |A intersect B| / |A union B| over the sets of tokens of the two strings.
 * Some code borrowed from the ir.vsr package by Raymond J. Mooney.
 *
 * @author Mikhail Bilenko
 */
public class JaccardMetric extends StringMetric
  implements DataDependentStringMetric, OptionHandler, Serializable {

  /** Strings are mapped to StringReferences in this hash */
  protected HashMap m_stringRefHash = null;

  /** A HashMap where tokens are indexed. Each indexed token maps
   * to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A list of all indexed strings. Elements are StringReference's. */
  public ArrayList m_stringRefs = null;

  /** An underlying tokenizer that is used for converting strings
   * into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** We can have different ways of converting from similarity to distance */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "distance = 1-similarity"),
    new Tag(CONVERSION_LAPLACIAN, "distance=1/(1+similarity)"),
    new Tag(CONVERSION_EXPONENTIAL, "distance=exp(-similarity)")
  };

  /** The method of converting, by default laplacian */
  protected int m_conversionType = CONVERSION_LAPLACIAN;

  /**
   * Construct an empty Jaccard metric; call buildMetric() to index a
   * set of strings before computing similarities.
   */
  public JaccardMetric() {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();
  }

  /**
   * Given a list of strings, build the vector space.  May be called more
   * than once; each call replaces the previously indexed strings.
   *
   * @param strings a list of strings from which the inverted index is
   * to be constructed
   * @exception Exception if the metric cannot be built
   */
  public void buildMetric(List strings) throws Exception {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    // BUG FIX: m_stringRefs was not reset here, so repeated calls to
    // buildMetric() accumulated stale StringReferences from earlier builds.
    m_stringRefs = new ArrayList();

    // Loop, processing each of the examples
    Iterator stringIterator = strings.iterator();
    while (stringIterator.hasNext()) {
      String string = (String) stringIterator.next();
      // Create a document vector for this document
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexString(string, vector);
    }
    System.out.println("Indexed " + m_stringRefs.size() +
                       " documents with " + size() + " unique terms.");
  }

  /**
   * Index a given string using its corresponding vector.
   *
   * @param string the string to index
   * @param vector the token-count vector produced by the tokenizer
   */
  protected void indexString(String string, HashMapVector vector) {
    // Create a new reference
    StringReference strRef = new StringReference(string, vector);
    m_stringRefs.add(strRef);
    m_stringRefHash.put(string, strRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String) entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int) ((Weight) entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, strRef);
    }
  }

  /**
   * Add a token occurrence to the index.
   *
   * @param token The token to index.
   * @param count The number of times it occurs in the document.
   * @param strRef A reference to the String it occurs in.
   */
  protected void indexToken(String token, int count, StringReference strRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(strRef, count));
  }

  /**
   * Compute the Jaccard similarity between two previously indexed strings:
   * |A intersect B| / |A union B| over their distinct-token sets.
   *
   * @param s1 first string
   * @param s2 second string
   * @return similarity between the two strings, in [0,1]
   * @throws IllegalArgumentException if either string was not indexed
   * via buildMetric()
   */
  public double similarity(String s1, String s2) {
    StringReference stringRef1 = (StringReference) m_stringRefHash.get(s1);
    StringReference stringRef2 = (StringReference) m_stringRefHash.get(s2);
    // Fail with a diagnostic instead of a bare NullPointerException
    if (stringRef1 == null || stringRef2 == null) {
      throw new IllegalArgumentException("String has not been indexed; " +
                                         "buildMetric() must be called first");
    }
    HashMapVector v1 = stringRef1.m_vector;
    HashMapVector v2 = stringRef2.m_vector;

    // Count distinct tokens present in both vectors
    int common = 0;
    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      if (v2.hashMap.containsKey(token)) {
        common++;
      }
    }

    int l1 = v1.size();
    int l2 = v2.size();
    // BUG FIX: the Jaccard coefficient is |intersection| / |union|, where
    // |union| = l1 + l2 - common.  The previous formula, common/max(l1,l2),
    // is not Jaccard and disagreed with distance1() below, which correctly
    // computes the Jaccard distance 1 - |intersection|/|union|.
    int union = l1 + l2 - common;
    double jaccard = 0;
    if (union != 0) {
      jaccard = ((double) common) / union;
    }
    return jaccard;
  }

  /**
   * The computation of a metric can be either based on distance, or on similarity.
   *
   * @return false because Jaccard fundamentally computes similarity
   */
  public boolean isDistanceBased() {
    return false;
  }

  /**
   * Set the tokenizer to use.
   *
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /**
   * Get the tokenizer to use.
   *
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /**
   * Return the number of tokens indexed.
   *
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /**
   * Compute the Jaccard distance between two raw (un-indexed) strings by
   * tokenizing them directly with a StringTokenizer.  Returns
   * unique/(unique+common), i.e. 1 - |intersection|/|union| over the
   * distinct-token sets.
   *
   * @param s1 first string
   * @param s2 second string
   * @return the Jaccard distance between the two strings, in [0,1]
   * @exception Exception if distance could not be estimated
   */
  public double distance1(String s1, String s2) throws Exception {
    TreeMap hash = new TreeMap();
    StringTokenizer parser = new StringTokenizer(s1, " ,()[]/.;:");
    while (parser.hasMoreTokens()) {
      String token = parser.nextToken();
      hash.put(token, token);
    }
    parser = new StringTokenizer(s2, " ,()[]/.;:");
    int common = 0, unique = 0;
    while (parser.hasMoreTokens()) {
      String token = parser.nextToken();
      if (hash.containsKey(token)) {
        // Token seen in both strings; remove so the leftover hash holds
        // only tokens unique to s1
        common++;
        hash.remove(token);
      } else {
        unique++;
      }
    }
    // Tokens unique to s2 plus tokens unique to s1
    unique += hash.size();
    return (double) unique / ((double) unique + common);
  }

  /**
   * Returns distance between two strings using the current conversion
   * type (CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, CONVERSION_UNIT, ...).
   *
   * @param string1 First string.
   * @param string2 Second string.
   * @exception Exception if distance could not be estimated.
   */
  public double distance(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN:
      return 1 / (1 + similarity(string1, string2));
    case CONVERSION_UNIT:
      // NOTE(review): the factor of 2 disagrees with the TAGS_CONVERSION
      // label "distance = 1-similarity"; left unchanged in case callers
      // depend on the scaled value — confirm which is intended.
      return 2 * (1 - similarity(string1, string2));
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-similarity(string1, string2));
    default:
      throw new Exception("Unknown similarity to distance conversion method");
    }
  }

  /**
   * Set the type of similarity to distance conversion.  Values other
   * than CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   * will be ignored.
   *
   * @param conversionType type of the similarity to distance conversion to use
   */
  public void setConversionType(SelectedTag conversionType) {
    if (conversionType.getTags() == TAGS_CONVERSION) {
      m_conversionType = conversionType.getSelectedTag().getID();
    }
  }

  /**
   * Return the type of similarity to distance conversion.
   *
   * @return one of CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   */
  public SelectedTag getConversionType() {
    return new SelectedTag(m_conversionType, TAGS_CONVERSION);
  }

  /**
   * Create a copy of this metric.  Note: the copy shares the tokenizer and
   * does not copy the index; buildMetric() must be called on the clone.
   *
   * @return another JaccardMetric with the same parameters as this metric
   */
  public Object clone() {
    JaccardMetric metric = new JaccardMetric();
    metric.setConversionType(new SelectedTag(m_conversionType, TAGS_CONVERSION));
    metric.setTokenizer(m_tokenizer);
    return metric;
  }

  /**
   * Gets the current settings of JaccardMetric.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    // BUG FIX: grow the option list dynamically instead of using a fixed
    // String[20], which could overflow if the tokenizer reports many options.
    ArrayList optionList = new ArrayList();

    if (m_conversionType == CONVERSION_EXPONENTIAL) {
      optionList.add("-E");
    } else if (m_conversionType == CONVERSION_UNIT) {
      optionList.add("-U");
    }

    optionList.add("-T");
    optionList.add(Utils.removeSubstring(m_tokenizer.getClass().getName(),
                                         "weka.deduping.metrics."));
    if (m_tokenizer instanceof OptionHandler) {
      String[] tokenizerOptions = ((OptionHandler) m_tokenizer).getOptions();
      for (int i = 0; i < tokenizerOptions.length; i++) {
        optionList.add(tokenizerOptions[i]);
      }
    }

    return (String[]) optionList.toArray(new String[optionList.size()]);
  }

  /**
   * Parses a given list of options. Valid options are:<p>
   *
   * -S use stemming
   * -R remove stopwords
   * -N gram size
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    // TODO: not yet implemented; options returned by getOptions() are
    // currently ignored here.
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}