/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Blocking.java
 *    Copyright (C) 2003 Mikhail Bilenko
 *
 */

package weka.deduping.blocking;

import java.util.*;
import java.io.Serializable;
import java.text.SimpleDateFormat;

import weka.core.*;
import weka.deduping.*;
import weka.deduping.metrics.*;

/**
 * This class takes a set of records, amalgamates each of them into a single
 * string and creates an inverted index for that collection. It can then
 * return the pairs of strings that are most alike. Largely borrowed from
 * VectorSpaceMetric.
 *
 * @author Mikhail Bilenko
 */
public class Blocking implements OptionHandler, Serializable {

  /** The dataset that contains the instances */
  protected Instances m_instances = null;

  /** Instances are mapped to InstanceReferences in this hash */
  protected HashMap m_instanceRefHash = null;

  /** A HashMap where tokens are indexed.  Each indexed token maps
   *  to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A TreeSet where the InstancePairs are stored for subsequent retrieval */
  protected TreeSet m_pairSet = new TreeSet(new InstancePairComparator());

  class InstancePairComparator implements java.util.Comparator {
    public InstancePairComparator() {}

    public int compare(Object o1, Object o2) {
      // InstancePairs implement Comparable; negate the result so that the
      // TreeSet iterates in order of decreasing similarity
      int result = ((Comparable)o1).compareTo(o2);
      if (result != 0) {
        return -result;
      } else {
        // Ties are resolved in a very ad hoc way by comparing attribute
        // values of the paired instances...  TODO: a better way?
        InstancePair p1 = (InstancePair) o1;
        for (int i = 0; i < p1.instance1.numValues(); i++) {
          double v1 = p1.instance1.value(i);
          double v2 = p1.instance2.value(i);
          if (v1 != v2) {
            return ((v1 - v2) > 0) ? 1 : -1;
          }
        }
        InstancePair p2 = (InstancePair) o2;
        for (int i = 0; i < p2.instance1.numValues(); i++) {
          double v1 = p2.instance1.value(i);
          double v2 = p2.instance2.value(i);
          if (v1 != v2) {
            return ((v1 - v2) > 0) ? 1 : -1;
          }
        }
        for (int i = 0; i < p1.instance1.numValues(); i++) {
          double v1 = p1.instance1.value(i);
          double v2 = p2.instance1.value(i);
          if (v1 != v2) {
            return ((v1 - v2) > 0) ? 1 : -1;
          }
        }
        // Last resort: compare the first characters of the first string value
        // of each pair so that a nonzero result is always returned
        System.err.println("Blocking: could not break a tie between two instance pairs"
                           + " via attribute values");
        String s1 = p1.instance1.stringValue(0);
        String s2 = p2.instance1.stringValue(0);
        return (s1.charAt(0) > s2.charAt(0)) ? 1 : -1;
      }
    }
  }

  /** A list of all indexed instances.  Elements are InstanceReference's. */
  public ArrayList m_instanceRefs = null;

  /** An underlying tokenizer that is used for converting strings
   *  into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** Should IDF weighting be used? */
  protected boolean m_useIDF = true;

  /** Create a new blocking object with empty index structures */
  public Blocking() {
    m_instanceRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_instanceRefs = new ArrayList();
  }

  /** Given a set of instances, build the inverted index
   * @param instances instances from which the inverted index is to be constructed
   */
  public void buildIndex(Instances instances) throws Exception {
    m_instances = instances;
    m_instanceRefHash = new HashMap();
    m_tokenHash = new HashMap();
    int classIndex = instances.classIndex();

    for (int i = 0; i < instances.numInstances(); i++) {
      Instance instance = instances.instance(i);

      // Amalgamate all non-class attribute values into a single string
      StringBuffer buffer = new StringBuffer();
      for (int j = 0; j < instance.numAttributes(); j++) {
        if (j != classIndex) {
          buffer.append(instance.stringValue(j)).append(" ");
        }
      }

      // Create a document vector for this document
      String string = buffer.toString();
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexInstance(instance, i, string, vector);
    }

    // Now that all instances have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandStringLengths();
    System.out.println(getTimestamp() + " Indexed " + m_instanceRefs.size()
                       + " documents with " + size() + " unique terms.");

    createPairSet();
    System.out.println(getTimestamp() + " Created a set with " + m_pairSet.size() + " pairs");
  }

  /** Index a given Instance using its corresponding vector */
  protected void indexInstance(Instance instance, int idx, String string, HashMapVector vector) {
    // Create a new reference
    InstanceReference instRef = new InstanceReference(instance, idx, string, vector);
    m_instanceRefs.add(instRef);
    m_instanceRefHash.put(instance, instRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      // An entry in the HashMap maps a token to a Weight
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int)((Weight)entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, instRef);
    }
  }

  /** Add a token occurrence to the index.
   * @param token the token to index
   * @param count the number of times it occurs in the document
   * @param instRef a reference to the Instance it occurs in
   */
  protected void indexToken(String token, int count, InstanceReference instRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo)m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenInstanceOccurrence(instRef, count));
  }
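  /*
   * The structures built above form the inverted index: m_tokenHash maps every
   * token to a TokenInfo whose occList holds one TokenInstanceOccurrence
   * (instance reference plus raw count) per document containing that token.
   * createPairSet() below only compares instances that co-occur in at least one
   * of these occurrence lists, which is what restricts ("blocks") the candidate
   * pairs to those sharing a token.
   */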
  /** Compute the IDF factor for every token in the index and the length
   *  of the string vector for every string referenced in the index.
   */
  protected void computeIDFandStringLengths() {
    // Let N be the total number of documents indexed
    double N = m_instanceRefs.size();

    // Iterate through each of the tokens in the index
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      TokenInfo tokenInfo = (TokenInfo)entry.getValue();

      // Get the total number of strings in which this token occurs
      double numInstanceRefs = tokenInfo.occList.size();

      // Calculate the IDF factor for this token
      double idf = Math.log(N / numInstanceRefs);
      if (idf == 0.0) {
        // If IDF is 0 (the token occurs in every document), just remove this
        // inconsequential token from the index
        mapEntries.remove();
      } else {
        tokenInfo.idf = idf;

        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenInstanceOccurrence occ = (TokenInstanceOccurrence)tokenInfo.occList.get(i);
          if (m_useIDF) {
            occ.instanceRef.length = occ.instanceRef.length + Math.pow(idf * occ.count, 2);
          } else {
            occ.instanceRef.length = occ.instanceRef.length + occ.count * occ.count;
          }
        }
      }
    }

    // At this point, every document length is the sum of the squares of
    // its token weights.  To calculate the final lengths, set the length of
    // every document reference to the square root of this sum.
    for (int i = 0; i < m_instanceRefs.size(); i++) {
      InstanceReference instanceRef = (InstanceReference)m_instanceRefs.get(i);
      instanceRef.length = Math.sqrt(instanceRef.length);
    }
  }

  /** Populate m_pairSet with all the InstancePairs that share common tokens, so that
   *  they can be retrieved in order of decreasing similarity later */
  public void createPairSet() {
    HashSet processedPairSet = new HashSet();

    // Iterate through each of the tokens in the index, getting instances containing them
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      TokenInfo tokenInfo = (TokenInfo)entry.getValue();

      // Get the total number of strings in which this token occurs
      int numInstanceRefs = tokenInfo.occList.size();

      // If more than one, compare each pair and add it to the pair set
      if (numInstanceRefs > 1) {
        for (int i = 0; i < numInstanceRefs; i++) {
          InstanceReference instRef1 = ((TokenInstanceOccurrence) tokenInfo.occList.get(i)).instanceRef;
          for (int j = i + 1; j < numInstanceRefs; j++) {
            InstanceReference instRef2 = ((TokenInstanceOccurrence) tokenInfo.occList.get(j)).instanceRef;
            Integer hashValue1 = new Integer(instRef1.idx * m_instances.numInstances() + instRef2.idx);
            Integer hashValue2 = new Integer(instRef2.idx * m_instances.numInstances() + instRef1.idx);

            // If the similarity for this pair of instances has not been calculated
            // before, calculate it and store the pair
            if (!processedPairSet.contains(hashValue1)) {
              double sim = similarity(instRef1, instRef2);
              InstancePair pair = new InstancePair(instRef1.instance, instRef2.instance,
                                                   (instRef1.instance.classValue()
                                                    == instRef2.instance.classValue()),
                                                   sim);
              m_pairSet.add(pair);
              processedPairSet.add(hashValue1);
              processedPairSet.add(hashValue2);
            }
          }
        }
      }
    }
  }
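  /*
   * A note on the score used below: with IDF weighting on, similarity() computes the
   * TF-IDF cosine similarity between the two token-count vectors,
   *
   *     sim(d1, d2) = sum_t [ idf(t)^2 * count(d1, t) * count(d2, t) ] / (length1 * length2),
   *
   * where length_i = sqrt( sum_t (idf(t) * count(d_i, t))^2 ) is precomputed in
   * computeIDFandStringLengths() and idf(t) = ln(N / df(t)).  With IDF weighting off,
   * the idf factors are dropped from both the numerator and the lengths, giving plain
   * cosine similarity over raw token counts.
   */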
  /** Compute the similarity between the strings of two indexed instances
   * @param iRef1 reference to the first instance
   * @param iRef2 reference to the second instance
   * @return the similarity between the two underlying strings
   */
  public double similarity(InstanceReference iRef1, InstanceReference iRef2) {
    double length1 = iRef1.length;
    double length2 = iRef2.length;
    HashMapVector v1 = iRef1.vector;
    HashMapVector v2 = iRef2.vector;
    double similarity = 0;

    if (length1 == 0 || length2 == 0) {
      return 0;
    }

    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the count for each token in the first vector
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();

      if (v2.hashMap.containsKey(token)) {
        double count1 = ((Weight)entry.getValue()).getValue();
        double count2 = ((Weight)v2.hashMap.get(token)).getValue();
        TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);

        // Add this component unless the token was removed from the index (idf = 0)
        if (tokenInfo != null) {
          double increment = count1 * count2;
          if (m_useIDF) {
            increment *= tokenInfo.idf * tokenInfo.idf;
          }
          similarity += increment;
        }
      }
    }
    similarity /= length1 * length2;
    return similarity;
  }

  /** Return the most similar pairs
   * @param numPairs the maximum number of pairs to return
   * @return an array with the numPairs most similar pairs, in order of decreasing similarity
   */
  public InstancePair[] getMostSimilarPairs(int numPairs) {
    Iterator iterator = m_pairSet.iterator();
    int i = 0;
    InstancePair[] pairs = new InstancePair[numPairs];
    while (iterator.hasNext() && i < numPairs) {
      InstancePair pair = (InstancePair) iterator.next();
      pairs[i++] = pair;
      //      System.out.println(pair.value + "\t" + pair.positive);
    }
    return pairs;
  }

  /** Return the number of tokens indexed.
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /** Set the tokenizer to use
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /** Get the tokenizer that is used
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /** Turn IDF weighting on/off
   * @param useIDF if true, all token weights will be weighted by IDF
   */
  public void setUseIDF(boolean useIDF) {
    m_useIDF = useIDF;
  }

  /** Check whether IDF weighting is on/off
   * @return if true, all token weights are weighted by IDF
   */
  public boolean getUseIDF() {
    return m_useIDF;
  }

  /**
   * Gets the current settings of Blocking
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[1];
    int current = 0;
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Gets a string containing the current date and time.
   *
   * @return a string containing the date and time
   */
  protected static String getTimestamp() {
    return (new SimpleDateFormat("HH:mm:ss:")).format(new Date());
  }

  /**
   * Parses a given list of options.  Currently no options are supported.
   */
  public void setOptions(String[] options) throws Exception {
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}
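/*
 * Example usage: a minimal sketch, not part of the original class.  The wrapper
 * class name and the file name "records.arff" are only illustrative; it is assumed
 * that the records have string (or nominal) attributes and a class attribute that
 * identifies duplicate clusters.
 *
 *   package weka.deduping.blocking;
 *
 *   import java.io.BufferedReader;
 *   import java.io.FileReader;
 *   import weka.core.*;
 *   import weka.deduping.*;
 *
 *   public class BlockingExample {
 *     public static void main(String[] args) throws Exception {
 *       // Load the records and mark the last attribute as the (duplicate-cluster) class
 *       Instances data = new Instances(new BufferedReader(new FileReader("records.arff")));
 *       data.setClassIndex(data.numAttributes() - 1);
 *
 *       // Build the inverted index and the similarity-ordered pair set
 *       Blocking blocking = new Blocking();
 *       blocking.setUseIDF(true);
 *       blocking.buildIndex(data);
 *
 *       // Retrieve the 100 candidate pairs with the highest similarity; the array
 *       // may have trailing nulls if fewer pairs were generated
 *       InstancePair[] candidates = blocking.getMostSimilarPairs(100);
 *       for (int i = 0; i < candidates.length && candidates[i] != null; i++) {
 *         System.out.println(candidates[i].value + "\t" + candidates[i].positive);
 *       }
 *     }
 *   }
 */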