/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * NGramTokenizer.java * Copyright (C) 2001 Mikhail Bilenko * */ package weka.deduping.metrics; import java.util.*; import java.io.*; import weka.core.*; /** * This class defines a tokenizer that turns strings into HashMapVectors * of n-grams * * @author Mikhail Bilenko */ public class NGramTokenizer extends Tokenizer implements Serializable, OptionHandler { /** Converting all tokens to lowercase */ protected boolean m_caseInsensitive = true; /** Stemming */ protected boolean m_stemming = false; protected Porter m_stemmer = new Porter(); /** Stopword removal */ protected boolean m_stopwordRemoval = false; /** The with the stopword list */ protected static String m_stopwordFilename = "/u/mbilenko/weka/weka/deduping/metrics/stopwords.txt"; /** Stopword hash */ protected static HashSet m_stopwordSet = null; /** Length of an n-gram */ protected int m_n = 3; /** A default set of space-equivalent characters */ protected String m_spaceEquivalents = "\t\n\r\f\'\"\\!@#$%^&*()_-+={}<>,.;:|[]{}/*~`"; protected char[] m_spaceChars = null; /** if true, all space equivalents will be replaced with a single space */ protected boolean m_replaceSpaces = false; /** A default constructor */ public NGramTokenizer() { super(); m_spaceChars = m_spaceEquivalents.toCharArray(); setStemming(false); setStopwordRemoval(false); } /** Take a string and create a vector of n-gram tokens from it * @param string a String to tokenize * @returns vector with individual tokens */ public HashMapVector tokenize(String string) { if (m_caseInsensitive) { string = string.toLowerCase(); } StringBuffer filteredString = new StringBuffer(); // only need to tokenize if stemming, or removing stopwords, or replacing space equivalents if (m_stemming || m_stopwordRemoval || m_replaceSpaces) { StringTokenizer tokenizer = new StringTokenizer(string, m_spaceEquivalents, true); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (m_stemming) { token = stem(token); } if (m_stopwordRemoval && m_stopwordSet.contains(token)) { continue; } if (m_replaceSpaces && token.length() == 1) { if (m_spaceEquivalents.indexOf(token) > -1) { filteredString.append(" "); } } else { filteredString.append(token); } } } else { filteredString = new StringBuffer(string); } char[] chars = filteredString.toString().toCharArray(); HashMapVector result = new HashMapVector(); for (int i = 0; i < chars.length - m_n; i++) { String token = new String(chars, i, m_n); result.increment(token); } return result; } /** Take a string and create a TokenString of overlapping n-gram tokens from it * @param string a String to tokenize * @returns vector with individual tokens */ public TokenString getTokenString(String string) { TokenString ts = new TokenString(string); ArrayList tokenList = new ArrayList(); StringBuffer filteredString = new StringBuffer(); // only need to tokenize if stemming, or removing stopwords, or replacing space equivalents if (m_stemming || m_stopwordRemoval || m_replaceSpaces) { StringTokenizer tokenizer = new StringTokenizer(string, m_spaceEquivalents, true); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (m_stemming) { token = stem(token); } if (m_stopwordRemoval && m_stopwordSet.contains(token)) { continue; } if (m_replaceSpaces && token.length() == 1) { if (m_spaceEquivalents.indexOf(token) > -1) { filteredString.append(" "); } } else { filteredString.append(token); } } } else { filteredString = new StringBuffer(string); } char[] chars = filteredString.toString().toCharArray(); HashMapVector result = new HashMapVector(); for (int i = 0; i < chars.length - m_n; i++) { String token = new String(chars, i, m_n); Object o = m_stringIDmap.get(token); if (o == null) { m_stringIDmap.put(token, new Integer(m_currIDidx++)); } tokenList.add(token); } // convert the tokenList into the two arrays inside TokenString ts.tokens = new String[tokenList.size()]; ts.tokens = (String[]) tokenList.toArray(ts.tokens); ts.tokenIDs = new int[ts.tokens.length]; for (int i = 0; i < ts.tokens.length; i++) { ts.tokenIDs[i] = ((Integer)m_stringIDmap.get(ts.tokens[i])).intValue(); } return ts; } /** Set the gram length * @param n the gram length */ public void setN(int n) { m_n = n; } /** Get the gram length * @return the gram length */ public int getN() { return m_n; } /** Specify which characters should be treated as spaces * @param spaceEquivalents a string containing space equivalents */ public void setSpaceEquivalents(String spaceEquivalents) { m_spaceEquivalents = new String(spaceEquivalents); m_spaceChars = m_spaceEquivalents.toCharArray(); } /** Get the haracters that should be treated as spaces * @return a string containing space equivalents */ public String getSpaceEquivalents() { return m_spaceEquivalents; } /** Turn on/off replacing space equivalents with a single space */ public void setReplaceSpaces(boolean replaceSpaces) { m_replaceSpaces = replaceSpaces; } public boolean getReplaceSpaces() { return m_replaceSpaces; } /** * Gets the current settings of NGramTokenizer. * * @return an array of strings suitable for passing to setOptions() */ public String [] getOptions() { String [] options = new String [10]; int current = 0; if (m_stemming) { options[current++] = "-S"; } if (m_stopwordRemoval) { options[current++] = "-R"; } if (m_replaceSpaces) { options[current++] = "-spaces"; } options[current++] = "-N"; options[current++] = "" + m_n; while (current < options.length) { options[current++] = ""; } return options; } /** * Parses a given list of options. Valid options are:<p> * * -S use stemming * -R remove stopwords * -N gram size */ public void setOptions(String[] options) throws Exception { setStemming(Utils.getFlag('S', options)); setStopwordRemoval(Utils.getFlag('R', options)); String nString = Utils.getOption('N', options); if (nString.length() != 0) { setN(Integer.parseInt(nString)); } } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(5); newVector.addElement(new Option("\tUse Porter stemmer for stemming\n", "S", 0, "-S")); newVector.addElement(new Option("\tRemove stopwords\n", "R", 0, "-R")); newVector.addElement(new Option("\tGram size\n", "N", 1, "-N")); return newVector.elements(); } /** Turn case sensitivity on/off * @param caseInsensitive if true, the tokenizer is case-insensitive */ public void setCaseInsensitive(boolean caseInsensitive) { m_caseInsensitive = caseInsensitive; } /** Turn case sensitivity on/off * @return if true, the tokenizer is case-insensitive */ public boolean getCaseInsensitive() { return m_caseInsensitive; } /** Turn stemming on/off * @param stemming if true, stemming is used */ public void setStemming(boolean stemming) { m_stemming = stemming; if (stemming) { m_stemmer = new Porter(); } } /** Find out whether stemming is on/off * @return if true, stemming is used */ public boolean getStemming() { return m_stemming; } /** Stem a given token * @param token the token to be stemmed * @return a new token resulting from applying the stemmer */ public String stem(String token) { return m_stemmer.stripAffixes(token); } /** Turn stopword removal on/off and load the stopwords * @param stopwordRemoval if true, stopwords from m_stopwordFile will be removed */ public void setStopwordRemoval(boolean stopwordRemoval) { m_stopwordRemoval = stopwordRemoval; if (m_stopwordRemoval) { try { BufferedReader in = new BufferedReader(new FileReader(m_stopwordFilename)); m_stopwordSet = new HashSet(); String stopword; while ((stopword = in.readLine()) != null) { m_stopwordSet.add(stopword); } } catch (Exception e) { System.out.println("Problems initializing the stopwords from " + m_stopwordFilename); } } } /** Get whether stopword removal is on or off * @return true if stopword removal is on */ public boolean getStopwordRemoval() { return m_stopwordRemoval; } }