StringToWordVector.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    StringToWordVector.java
 *    Copyright (C) 2002 University of Waikato
 *
 *    Updated 12/Dec/2001 by Gordon Paynter (gordon.paynter@ucr.edu)
 *                        Added parameters for delimiter set,
 *                        number of words to add, and input range.
 */


package weka.filters.unsupervised.attribute;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Random;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;


/** 
 * Converts String attributes into a set of attributes representing word
 * occurrence information from the text contained in the strings. The set of
 * words (attributes) is determined by the first batch filtered (typically
 * training data).
 *
 * @author Len Trigg (len@reeltwo.com)
 * @author Stuart Inglis (stuart@reeltwo.com)
 * @version $Revision: 1.1.1.1 $ 
 */
public class StringToWordVector extends Filter
  implements UnsupervisedFilter, OptionHandler {

  /** Delimiters used in tokenization */
  private String delimiters = " \n\t.,:'\"()?!";

  /** Range of columns to convert to word vectors */
  protected Range m_SelectedRange = null;

  /** Contains a mapping of valid words to attribute indexes */
  private TreeMap m_Dictionary = new TreeMap();

  /** True if the first batch has been done */
  private boolean m_FirstBatchDone = false;

  /** True if output instances should contain word frequency rather than boolean 0 or 1. */
  private boolean m_OutputCounts = false;

  /**
   * The default number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   */
  private int m_WordsToKeep = 1000;

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(3);

    newVector.addElement(new Option(
				    "\tOutput word counts rather than boolean word presence.\n",
				    "C", 0, "-C"));
    newVector.addElement(new Option(
				    "\tString containing the set of delimiter characters\n"
				    + "\t(default: \" \\n\\t.,:'\\\"()?!\")",
				    "D", 1, "-D <delimiter set>"));
    newVector.addElement(new Option(
				    "\tSpecify list of string attributes to convert to words (as weka Range).\n"
				    + "\t(default: select all string attributes)",
				    "R", 1, "-R <index1,index2-index4,...>"));
    newVector.addElement(new Option(
				    "\tSpecify approximate number of word fields to create.\n"
				    + "\tSurplus words will be discarded..\n"
				    + "\t(default: 1000)",
				    "W", 1, "-W <number of words to keep>"));

    return newVector.elements();
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -C<br>
   * Output word counts rather than boolean word presence.<p>
   * 
   * -D delimiter_charcters <br>
   * Specify set of delimiter characters
   * (default: " \n\t.,:'\\\"()?!\"<p>
   *
   * -R index1,index2-index4,...<br>
   * Specify list of string attributes to convert to words.
   * (default: all string attributes)<p>
   *
   * -W number_of_words_to_keep <br>
   * Specify number of word fields to create.
   * Other, less useful words will be discarded.
   * (default: 1000)<p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    String value = Utils.getOption('D', options);
    if (value.length() != 0) {
      setDelimiters(value);
    }

    value = Utils.getOption('R', options);
    if (value.length() != 0) {
      setSelectedRange(value);
    }

    value = Utils.getOption('W', options);
    if (value.length() != 0) {
      setWordsToKeep(Integer.valueOf(value).intValue());
    }

    setOutputWordCounts(Utils.getFlag('C', options));
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [11];
    int current = 0;

    options[current++] = "-D"; 
    options[current++] = getDelimiters();

    if (getSelectedRange() != null) {
      options[current++] = "-R"; 
      m_SelectedRange.setUpper(getInputFormat().numAttributes() - 1);
      options[current++] = getSelectedRange().getRanges();
    }

    options[current++] = "-W"; 
    options[current++] = String.valueOf(getWordsToKeep());

    if (getOutputWordCounts()) {
      options[current++] = "-C";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Default constructor. Targets 1000 words in the output.
   */
  public StringToWordVector() {
  }

  /**
   * Constructor that allows specification of the target number of words
   * in the output.
   *
   * @param wordsToKeep the number of words in the output vector (per class
   * if assigned).
   */
  public StringToWordVector(int wordsToKeep) {
    m_WordsToKeep = wordsToKeep;
  }
  
  /** 
   * Used to store word counts for dictionary selection based on 
   * a threshold.
   */
  private class Count implements Serializable {
    public int count;
    public Count(int c) { count = c; }
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input 
   * instance structure (any instances contained in the object are 
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @exception Exception if the input format can't be set 
   * successfully
   */
  public boolean setInputFormat(Instances instanceInfo) 
    throws Exception {

    super.setInputFormat(instanceInfo);
    m_FirstBatchDone = false;
    return false;
  }

  /**
   * Input an instance for filtering. Filter requires all
   * training instances be read before producing output.
   *
   * @param instance the input instance.
   * @return true if the filtered instance may now be
   * collected with output().
   * @exception IllegalStateException if no input structure has been defined.
   */
  public boolean input(Instance instance) {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }
    if (m_FirstBatchDone) {
      convertInstance(instance);
      return true;
    } else {
      bufferInput(instance);
      return false;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished. 
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output.
   * @exception IllegalStateException if no input structure has been defined.
   */
  public boolean batchFinished() {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    // Determine the dictionary
    if (!m_FirstBatchDone) {
      determineDictionary();
    }

    // Convert pending input instances.
    for(int i = 0; i < getInputFormat().numInstances(); i++) {
      convertInstance(getInputFormat().instance(i));
    }
    flushInput();

    m_NewBatch = true;
    m_FirstBatchDone = true;
    return (numPendingOutput() != 0);
  }

  /**
   * Gets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @return true if word counts should be output.
   */
  public boolean getOutputWordCounts() {
    return m_OutputCounts;
  }

  /**
   * Sets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @param outputWordCounts true if word counts should be output.
   */
  public void setOutputWordCounts(boolean outputWordCounts) {
    m_OutputCounts = outputWordCounts;
  }

  /**
   * Get the value of delimiters.
   *
   * @return Value of delimiters.
   */
  public String getDelimiters() {
    return delimiters;
  }
    
  /**
   * Set the value of delimiters.
   *
   * @param newdelimiters Value to assign to delimiters.
   */
  public void setDelimiters(String newDelimiters) {
    delimiters = newDelimiters;
  }

  /**
   * Get the value of m_SelectedRange.
   *
   * @return Value of m_SelectedRange.
   */
  public Range getSelectedRange() {
    return m_SelectedRange;
  }
    
  /**
   * Set the value of m_SelectedRange.
   *
   * @param newSelectedRange Value to assign to m_SelectedRange.
   */
  public void setSelectedRange(String newSelectedRange) {
    m_SelectedRange = new Range(newSelectedRange);
  }

  /**
   * Gets the number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   *
   * @return the target number of words in the output vector (per class if
   * assigned).
   */
  public int getWordsToKeep() {

    return m_WordsToKeep;
  }
  
  /**
   * Sets the number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   *
   * @param newWordsToKeep the target number of words in the output 
   * vector (per class if assigned).
   */
  public void setWordsToKeep(int newWordsToKeep) {

    m_WordsToKeep = newWordsToKeep;
  }
  

  private static void sortArray(int [] array) {
      
    int i, j, h, N = array.length - 1;
	
    for (h = 1; h <= N / 9; h = 3 * h + 1); 
	
    for (; h > 0; h /= 3) {
      for (i = h + 1; i <= N; i++) { 
        int v = array[i]; 
        j = i; 
        while (j > h && array[j - h] > v ) { 
          array[j] = array[j - h]; 
          j -= h; 
        } 
        array[j] = v; 
      } 
    }
  }

  private void determineSelectedRange() {
    
    Instances inputFormat = getInputFormat();
    
    // Calculate the default set of fields to convert
    if (m_SelectedRange == null) {
      StringBuffer fields = new StringBuffer();
      for (int j = 0; j < inputFormat.numAttributes(); j++) { 
	if (inputFormat.attribute(j).type() == Attribute.STRING)
	  fields.append((j + 1) + ",");
      }
      m_SelectedRange = new Range(fields.toString());
    }
    
    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
    
    // Prevent the user from converting non-string fields
    StringBuffer fields = new StringBuffer();
    for (int j = 0; j < inputFormat.numAttributes(); j++) { 
      if (m_SelectedRange.isInRange(j) 
	  && inputFormat.attribute(j).type() == Attribute.STRING)
	fields.append((j + 1) + ",");
    }
    m_SelectedRange.setRanges(fields.toString());
    
    // System.err.println("Selected Range: " + getSelectedRange().getRanges()); 
  }
  
  private void determineDictionary() {
    
    // System.err.println("Creating dictionary"); 
    
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (classInd != -1) {
      values = getInputFormat().attribute(classInd).numValues();
    }
    TreeMap dictionaryArr [] = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      /*
	if (i % 10 == 0) {
        System.err.print( i + " " + getInputFormat().numInstances() + "\r"); 
        System.err.flush();
	}
      */
      Instance instance = getInputFormat().instance(i);
      for (int j = 0; j < instance.numAttributes(); j++) { 
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
	  //getInputFormat().attribute(j).type() == Attribute.STRING 
            
          StringTokenizer st = new StringTokenizer(instance.stringValue(j),
                                                   delimiters);
          while (st.hasMoreTokens()) {
            String word = st.nextToken().intern();
            int vInd = 0;
            if (classInd != -1) {
              vInd = (int)instance.classValue();
            }
            Count count = (Count)dictionaryArr[vInd].get(word);
            if (count == null) {
              dictionaryArr[vInd].put(word, new Count(1));
            } else {
              count.count ++;
            }
          }
        }
      }
    }


    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
      totalsize += dictionaryArr[z].size();

      int array[] = new int[dictionaryArr[z].size()];
      int pos = 0;
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        array[pos] = count.count;
        pos++;
      }

      // sort the array
      sortArray(array);
      if (array.length < m_WordsToKeep) {
        // if there aren't enough words, set the threshold to 1
        prune[z] = 1;
      } else {
        // otherwise set it to be at least 1
        prune[z] = Math.max(1, array[array.length - m_WordsToKeep]);
      }

    }

    /*
      for (int z=0;z<values;z++) {
      System.err.println(dictionaryArr[z].size()+" "+totalsize);
      }
    */

    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize +
					   getInputFormat().numAttributes());

    // Add the non-converted attributes 
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) { 
        if (getInputFormat().classIndex() == i) {
          classIndex = attributes.size();
        }
	attributes.addElement(getInputFormat().attribute(i).copy());
      }     
    }

    // Add the word vector attributes
    TreeMap newDictionary = new TreeMap();

    int index = attributes.size();
    for(int z = 0; z < values; z++) {
      /*
	System.err.print("\nCreating word index...");
	if (values > 1) {
        System.err.print(" for class id=" + z); 
	}
	System.err.flush();
      */
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        if (count.count >= prune[z]) {
          //          System.err.println(word+" "+newDictionary.get(word));
          if(newDictionary.get(word) == null) {
            /*
	      if (values > 1) {
              System.err.print(getInputFormat().classAttribute().value(z) + " ");
	      }
	      System.err.println(word);
            */
            newDictionary.put(word, new Integer(index++));
            attributes.addElement(new Attribute(word));
          }
        }
      }
    }

    attributes.trimToSize();
    m_Dictionary = newDictionary;

    //System.err.println("done: " + index + " words in total.");

    
    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), 
                                           attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
  }


  private void convertInstance(Instance instance) {

    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) { 
      
	if (getInputFormat().attribute(i).type() != Attribute.STRING) {
	  // Add simple nominal and numeric attributes directly
	  if (instance.value(i) != 0.0) {
	    contained.put(new Integer(firstCopy), 
			  new Double(instance.value(i)));
	  } 
	} else {
	  if (instance.isMissing(i)) {
	    contained.put(new Integer(firstCopy),
			  new Double(Instance.missingValue()));
	  } else {

	    // If this is a string attribute, we have to first add
	    // this value to the range of possible values, then add
	    // its new internal index.
	    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
	      // Note that the first string value in a
	      // SparseInstance doesn't get printed.
	      outputFormatPeek().attribute(firstCopy)
		.addStringValue("Hack to defeat SparseInstance bug");
	    }
	    int newIndex = outputFormatPeek().attribute(firstCopy)
	      .addStringValue(instance.stringValue(i));
	    contained.put(new Integer(firstCopy), 
			  new Double(newIndex));
	  }
	}
	firstCopy++;
      }     
    }
    for (int j = 0; j < instance.numAttributes(); j++) { 
      //if ((getInputFormat().attribute(j).type() == Attribute.STRING) 
      if (m_SelectedRange.isInRange(j)
	  && (instance.isMissing(j) == false)) {          
        StringTokenizer st = new StringTokenizer(instance.stringValue(j),
                                                 delimiters);
        while (st.hasMoreTokens()) {
          String word = st.nextToken();
          Integer index = (Integer) m_Dictionary.get(word);
          if (index != null) {
            if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup
              Double count = (Double)contained.get(index);
              if (count != null) {
                contained.put(index, new Double(count.doubleValue() + 1.0));
              } else {
                contained.put(index, new Double(1));
              }
            } else {
              contained.put(index, new Double(1));
            }
          }
        }
      }
    }
    
    // Convert the set to structures needed to create a sparse instance.
    double [] values = new double [contained.size()];
    int [] indices = new int [contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
      Integer index = (Integer)it.next();
      Double value = (Double)contained.get(index);
      values[i] = value.doubleValue();
      indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices, 
                                       outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    push(inst);
    //System.err.print("#"); System.err.flush();
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter: 
   * use -h for help
   */
  public static void main(String [] argv) {

    try {
      if (Utils.getFlag('b', argv)) {
 	Filter.batchFilterFile(new StringToWordVector(), argv);
      } else {
	Filter.filterFile(new StringToWordVector(), argv);
      }
    } catch (Exception ex) {
      ex.printStackTrace();
      System.out.println(ex.getMessage());
    }
  }
}