// NGramTokenizer.java example
//
// Explorer
// wekax-master
// - weka-3-6-2
// - wekaUT
//   - GetAllSubPackages.java
//   - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    NGramTokenizer.java
 *    Copyright (C) 2001 Mikhail Bilenko
 *
 */


package weka.deduping.metrics;

import java.util.*;
import java.io.*;
import weka.core.*;

/**
 * A tokenizer that turns strings into character n-grams: every run of
 * {@link #getN() n} consecutive characters of the (optionally filtered)
 * input is one token.
 *
 * Before the n-grams are cut, the input can optionally be lower-cased,
 * stemmed, stripped of stopwords, and have its space-equivalent characters
 * replaced by blanks.
 *
 * @author Mikhail Bilenko
 */
public class NGramTokenizer extends Tokenizer implements Serializable, OptionHandler {
  /** Converting all tokens to lowercase */
  protected boolean m_caseInsensitive = true;

  /** Stemming */
  protected boolean m_stemming = false;
  protected Porter m_stemmer = new Porter();

  /** Stopword removal */
  protected boolean m_stopwordRemoval = false;
  /** The file with the stopword list, one stopword per line */
  protected static String m_stopwordFilename = "/u/mbilenko/weka/weka/deduping/metrics/stopwords.txt";
  /** Stopword hash; static, so it is shared by all instances and is not part of the serialized state */
  protected static HashSet m_stopwordSet = null;

  /** Length of an n-gram */
  protected int m_n = 3;

  /** A default set of space-equivalent characters (note that the blank character itself is not in this list) */
  protected String m_spaceEquivalents = "\t\n\r\f\'\"\\!@#$%^&*()_-+={}<>,.;:|[]{}/*~`";
  protected char[] m_spaceChars = null;

  /** if true, every space-equivalent character is replaced with a single space */
  protected boolean m_replaceSpaces = false;


  /** A default constructor: 3-grams, case-insensitive, no stemming, no stopword removal */
  public NGramTokenizer() {
    super();
    m_spaceChars = m_spaceEquivalents.toCharArray();
    setStemming(false);
    setStopwordRemoval(false);
  }

  /** Take a string and create a vector of n-gram tokens from it.
   * The string is lower-cased first if the tokenizer is case-insensitive,
   * then filtered (stemming / stopwords / space replacement), and every
   * n-gram of the result is counted via <code>HashMapVector.increment</code>.
   * A filtered string shorter than n yields an empty vector.
   *
   * @param string a String to tokenize
   * @return vector with individual tokens
   */
  public HashMapVector tokenize(String string) {
    if (m_caseInsensitive) {
      string = string.toLowerCase();
    }

    String filtered = filter(string);
    HashMapVector result = new HashMapVector();

    // Start offsets 0 .. length-n inclusive, so the n-gram ending on the
    // last character is included.
    int count = filtered.length() - m_n + 1;
    for (int i = 0; i < count; i++) {
      result.increment(filtered.substring(i, i + m_n));
    }
    return result;
  }

  /** Take a string and create a TokenString of overlapping n-gram tokens from it.
   * Every n-gram that has no entry in <code>m_stringIDmap</code> yet is
   * assigned the next free ID; the returned TokenString carries the n-grams
   * and their IDs in order of appearance.
   *
   * NOTE(review): unlike {@link #tokenize(String)}, this method does not
   * lower-case the input even when the tokenizer is case-insensitive. That
   * behaviour is kept as it was; confirm whether it is intended.
   *
   * @param string a String to tokenize
   * @return a TokenString holding the n-grams and their integer IDs
   */
  public TokenString getTokenString(String string) {
    TokenString ts = new TokenString(string);
    String filtered = filter(string);

    // Start offsets 0 .. length-n inclusive; nothing to emit when the
    // filtered string is shorter than n.
    int count = Math.max(0, filtered.length() - m_n + 1);
    ts.tokens = new String[count];
    ts.tokenIDs = new int[count];

    for (int i = 0; i < count; i++) {
      String token = filtered.substring(i, i + m_n);
      Integer id = (Integer) m_stringIDmap.get(token);
      if (id == null) {
        // first time this n-gram is seen: give it the next free ID
        id = new Integer(m_currIDidx++);
        m_stringIDmap.put(token, id);
      }
      ts.tokens[i] = token;
      ts.tokenIDs[i] = id.intValue();
    }
    return ts;
  }

  /** Apply the optional stemming, stopword removal and space replacement to a
   * string before it is cut into n-grams.
   *
   * @param string the (already case-normalized) input
   * @return the filtered string; the input itself if no filtering is enabled
   */
  private String filter(String string) {
    // only need to tokenize if stemming, or removing stopwords, or replacing space equivalents
    if (!(m_stemming || m_stopwordRemoval || m_replaceSpaces)) {
      return string;
    }

    StringBuffer filtered = new StringBuffer(string.length());
    // returnDelims = true: the delimiters come back as one-character tokens,
    // so they are either copied through or replaced below, never lost.
    StringTokenizer tokenizer = new StringTokenizer(string, m_spaceEquivalents, true);
    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();

      if (m_stemming) {
        token = stem(token);
      }
      if (m_stopwordRemoval && stopwords().contains(token)) {
        continue;
      }

      boolean isSpaceEquivalent =
        token.length() == 1 && m_spaceEquivalents.indexOf(token) > -1;
      if (m_replaceSpaces && isSpaceEquivalent) {
        filtered.append(' ');
      } else {
        // Everything else is kept, including one-character tokens that are
        // not space equivalents.
        filtered.append(token);
      }
    }
    return filtered.toString();
  }


  /** Set the gram length
   * @param n the gram length, must be at least 1
   * @throws IllegalArgumentException if n is smaller than 1
   */
  public void setN(int n) {
    if (n < 1) {
      throw new IllegalArgumentException("The gram length must be at least 1, got " + n);
    }
    m_n = n;
  }

  /** Get the gram length
   * @return the gram length
   */
  public int getN() {
    return m_n;
  }


  /** Specify which characters should be treated as spaces
   * @param spaceEquivalents a string containing space equivalents
   */
  public void setSpaceEquivalents(String spaceEquivalents) {
    // Convert first, so that a null argument fails before any field changes.
    char[] spaceChars = spaceEquivalents.toCharArray();
    m_spaceEquivalents = spaceEquivalents;
    m_spaceChars = spaceChars;
  }

  /** Get the characters that should be treated as spaces
   * @return a string containing space equivalents
   */
  public String getSpaceEquivalents() {
    return m_spaceEquivalents;
  }


  /** Turn on/off replacing space equivalents with a single space
   * @param replaceSpaces if true, each space-equivalent character is replaced with a space
   */
  public void setReplaceSpaces(boolean replaceSpaces) {
    m_replaceSpaces = replaceSpaces;
  }

  /** Find out whether space equivalents are replaced with a single space
   * @return true if space equivalents are replaced
   */
  public boolean getReplaceSpaces() {
    return m_replaceSpaces;
  }


  /**
   * Gets the current settings of NGramTokenizer.
   *
   * @return an array of strings suitable for passing to setOptions();
   * unused trailing slots are filled with empty strings
   */
  public String [] getOptions() {
    String [] options = new String [10];
    int current = 0;

    if (m_stemming) {
      options[current++] = "-S";
    }

    if (m_stopwordRemoval) {
      options[current++] = "-R";
    }

    if (m_replaceSpaces) {
      options[current++] = "-spaces";
    }

    options[current++] = "-N";
    options[current++] = "" + m_n;

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }


  /**
   * Parses a given list of options. Valid options are:<p>
   *
   * -S use stemming <p>
   * -R remove stopwords <p>
   * -spaces replace space equivalents with a single space <p>
   * -N gram size <p>
   *
   * Stemming and stopword removal are switched off when their flag is
   * absent. The -spaces flag only ever switches space replacement on; when
   * it is absent the current setting is left untouched, which is what this
   * method did before it recognized the flag.
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option cannot be parsed
   */
  public void setOptions(String[] options) throws Exception {
    // "-spaces" is consumed here, before the Utils helpers run, and blanked
    // out so they never see a multi-character flag they might not accept.
    if (options != null) {
      for (int i = 0; i < options.length; i++) {
        if ("-spaces".equals(options[i])) {
          setReplaceSpaces(true);
          options[i] = "";
        }
      }
    }

    setStemming(Utils.getFlag('S', options));

    setStopwordRemoval(Utils.getFlag('R', options));

    String nString = Utils.getOption('N', options);
    if (nString.length() != 0) {
      setN(Integer.parseInt(nString));
    }
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(5);

    newVector.addElement(new Option("\tUse Porter stemmer for stemming\n",
                                    "S", 0, "-S"));

    newVector.addElement(new Option("\tRemove stopwords\n",
                                    "R", 0, "-R"));

    newVector.addElement(new Option("\tReplace space equivalents with a single space\n",
                                    "spaces", 0, "-spaces"));

    newVector.addElement(new Option("\tGram size\n",
                                    "N", 1, "-N"));

    return newVector.elements();
  }

  /** Turn case sensitivity on/off
   * @param caseInsensitive if true, the tokenizer is case-insensitive
   */
  public void setCaseInsensitive(boolean caseInsensitive) {
    m_caseInsensitive = caseInsensitive;
  }

  /** Find out whether the tokenizer is case-insensitive
   * @return if true, the tokenizer is case-insensitive
   */
  public boolean getCaseInsensitive() {
    return m_caseInsensitive;
  }

  /** Turn stemming on/off
   * @param stemming if true, stemming is used
   */
  public void setStemming(boolean stemming) {
    m_stemming = stemming;
    if (stemming) {
      m_stemmer = new Porter();
    }
  }

  /** Find out whether stemming is on/off
   * @return if true, stemming is used
   */
  public boolean getStemming() {
    return m_stemming;
  }

  /** Stem a given token
   * @param token the token to be stemmed
   * @return a new token resulting from applying the stemmer
   */
  public String stem(String token) {
    return m_stemmer.stripAffixes(token);
  }

  /** Turn stopword removal on/off and load the stopwords.
   * Switching removal on (re)reads the stopword file. If the file cannot be
   * read, a message is printed and tokenization carries on with the stopwords
   * loaded earlier, or with none.
   *
   * @param stopwordRemoval if true, stopwords from m_stopwordFilename will be removed
   */
  public void setStopwordRemoval(boolean stopwordRemoval) {
    m_stopwordRemoval = stopwordRemoval;
    if (m_stopwordRemoval) {
      loadStopwords();
    }
  }

  /** Get whether stopword removal is on or off
   * @return true if stopword removal is on
   */
  public boolean getStopwordRemoval() {
    return m_stopwordRemoval;
  }

  /** Return the shared stopword set, loading it first if it is missing.
   * The set is static and therefore not restored with a deserialized
   * tokenizer, so it can be absent even though stopword removal is on.
   *
   * @return the stopword set, never null
   */
  private static HashSet stopwords() {
    if (m_stopwordSet == null) {
      loadStopwords();
    }
    return m_stopwordSet;
  }

  /** Read the stopword file, one stopword per line, into the shared set.
   * On failure a message is printed; a previously loaded set is kept, and if
   * there is none an empty set is installed so that lookups cannot hit null.
   */
  private static void loadStopwords() {
    HashSet loaded = new HashSet();
    BufferedReader in = null;
    try {
      in = new BufferedReader(new FileReader(m_stopwordFilename));
      String stopword;
      while ((stopword = in.readLine()) != null) {
        loaded.add(stopword);
      }
      m_stopwordSet = loaded;
    } catch (Exception e) {
      System.out.println("Problems initializing the stopwords from " + m_stopwordFilename);
      if (m_stopwordSet == null) {
        m_stopwordSet = new HashSet();
      }
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // nothing useful can be done if closing fails
        }
      }
    }
  }
}