// Copyright 2003-2008. Mark Watson (markw@markwatson.com). All rights reserved. // This software is released under the LGPL (www.fsf.org) // For an alternative non-GPL license: contact the author // THIS SOFTWARE COMES WITH NO WARRANTY package nlp.com.knowledgebooks.nlp.util; import java.util.*; import java.io.*; /** * * <p/> * Copyright 2007 by Mark Watson. All rights reserved. * <p/> */ public class Tokenizer { /** * utility to tokenize an input string into an Array of Strings * @param s2 string containing words to tokenize * @return a List<String> of parsed tokens */ static public List<String> wordsToList(String s2) { return wordsToList(s2, s2.length() + 1); } /** * utility to tokenize an input string into an Array of Strings - with a maximum # of returned words * @param s2 string containing words to tokenize * @param maxR maximum number of tokens to return * @return a List<String> of parsed tokens */ static public List<String> wordsToList(String s2, int maxR) { s2 = stripControlCharacters(s2); List<String> words = new ArrayList<String>(); String x; int count = 0; try { StreamTokenizer str_tok = new StreamTokenizer(new StringReader(s2)); str_tok.whitespaceChars('"', '"'); str_tok.whitespaceChars('\'', '\''); str_tok.whitespaceChars('/', '/'); //str_tok.wordChars(':', ':'); while (str_tok.nextToken() != StreamTokenizer.TT_EOF) { String s; switch (str_tok.ttype) { case StreamTokenizer.TT_EOL: s = ""; // we will ignore this break; case StreamTokenizer.TT_WORD: s = str_tok.sval; break; case StreamTokenizer.TT_NUMBER: s = "" + (int) str_tok.nval; // .toString(); // we will ignore this break; default : s = String.valueOf((char) str_tok.ttype); } if (s.length() < 1) continue; //if (s.indexOf("-") > -1) continue; //s = s.toLowerCase(); if (s.endsWith(".")) { // first check for abreviations like "N.J.": int index = s.indexOf("."); if (index < (s.length() - 1)) { words.add(s); } else { words.add(s.substring(0, s.length() - 1)); words.add("."); } } else if (s.endsWith(",")) { x = s.substring(0, s.length() - 1); if (x.length() > 0) words.add(x); words.add(","); } else if (s.endsWith(";")) { x = s.substring(0, s.length() - 1); if (x.length() > 0) words.add(x); words.add(";"); } else if (s.endsWith("?")) { x = s.substring(0, s.length() - 1); if (x.length() > 0) words.add(x); words.add("?"); } else if (s.endsWith(":")) { x = s.substring(0, s.length() - 1); if (x.length() > 0) words.add(x); words.add(":"); } else { words.add(s); } if (++count >= maxR) break; } } catch (Exception e) { e.printStackTrace(); } return words; } static private String stripControlCharacters(String s) { StringBuffer sb = new StringBuffer(s.length() + 1); for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); if (ch > 256 || ch == '\n' || ch == '\t' || ch == '\r' || ch == 226) { sb.append(' '); continue; } //System.out.println(" ch: " + ch + " (int)ch: " + (int)ch + " Character.isISOControl(ch): " + Character.isISOControl(ch)); if ((int) ch < 129) sb.append(ch); else sb.append(' '); } return sb.toString(); } public static void main(String []args) { String text = "The ball, rolling quickly, went down the hill."; List<String> tokens = Tokenizer.wordsToList(text); System.out.println(text); for (String token : tokens) System.out.print("\""+token+"\" "); System.out.println(); } }