/**
* Portions Copyright 2001 Sun Microsystems, Inc.
* Portions Copyright 1999-2001 Language Technologies Institute,
* Carnegie Mellon University.
* All Rights Reserved. Use is subject to license terms.
*
* See the file "license.terms" for information on usage and
* redistribution of this file, and for a DISCLAIMER OF ALL
* WARRANTIES.
*/
package edu.cmu.sphinx.alignment.tokenizer;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import edu.cmu.sphinx.alignment.Token;
/**
* Implements the tokenizer interface. Breaks an input sequence of characters
* into a set of tokens.
*/
public class CharTokenizer implements Iterator<Token> {
/** A constant indicating that the end of the stream has been read. */
public static final int EOF = -1;
/** A string containing the default whitespace characters. */
public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";
/** A string containing the default single characters. */
public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";
/** A string containing the default pre-punctuation characters. */
public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";
/** A string containing the default post-punctuation characters. */
public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS =
"\"'`.,:;!?(){}[]";
/** The line number. */
private int lineNumber;
/** The input text (from the Utterance) to tokenize. */
private String inputText;
/** The file to read input text from, if using file mode. */
private Reader reader;
/** The current character, whether its from the file or the input text. */
private int currentChar;
/**
* The current char position for the input text (not the file) this is
* called "file_pos" in flite
*/
private int currentPosition;
/** The delimiting symbols of this tokenizer. */
private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;
private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;
private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;
private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;
/** The error description. */
private String errorDescription;
/** A place to store the current token. */
private Token token;
private Token lastToken;
/**
* Constructs a Tokenizer.
*/
public CharTokenizer() {}
/**
* Creates a tokenizer that will return tokens from the given string.
*
* @param string the string to tokenize
*/
public CharTokenizer(String string) {
setInputText(string);
}
/**
* Creates a tokenizer that will return tokens from the given file.
*
* @param file where to read the input from
*/
public CharTokenizer(Reader file) {
setInputReader(file);
}
/**
* Sets the whitespace symbols of this Tokenizer to the given symbols.
*
* @param symbols the whitespace symbols
*/
public void setWhitespaceSymbols(String symbols) {
whitespaceSymbols = symbols;
}
/**
* Sets the single character symbols of this Tokenizer to the given
* symbols.
*
* @param symbols the single character symbols
*/
public void setSingleCharSymbols(String symbols) {
singleCharSymbols = symbols;
}
/**
* Sets the prepunctuation symbols of this Tokenizer to the given symbols.
*
* @param symbols the prepunctuation symbols
*/
public void setPrepunctuationSymbols(String symbols) {
prepunctuationSymbols = symbols;
}
/**
* Sets the postpunctuation symbols of this Tokenizer to the given symbols.
*
* @param symbols the postpunctuation symbols
*/
public void setPostpunctuationSymbols(String symbols) {
postpunctuationSymbols = symbols;
}
/**
* Sets the text to tokenize.
*
* @param inputString the string to tokenize
*/
public void setInputText(String inputString) {
inputText = inputString;
currentPosition = 0;
if (inputText != null) {
getNextChar();
}
}
/**
* Sets the input reader
*
* @param reader the input source
*/
public void setInputReader(Reader reader) {
this.reader = reader;
getNextChar();
}
/**
* Returns the next token.
*
* @return the next token if it exists, <code>null</code> if no more tokens
*/
public Token next() {
lastToken = token;
token = new Token();
// Skip whitespace
token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));
// quoted strings currently ignored
// get prepunctuation
token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));
// get the symbol itself
if (singleCharSymbols.indexOf(currentChar) != -1) {
token.setWord(String.valueOf((char) currentChar));
getNextChar();
} else {
token.setWord(getTokenNotOfCharClass(whitespaceSymbols));
}
token.setPosition(currentPosition);
token.setLineNumber(lineNumber);
// This'll have token *plus* postpunctuation
// Get postpunctuation
removeTokenPostpunctuation();
return token;
}
/**
* Returns <code>true</code> if there are more tokens, <code>false</code>
* otherwise.
*
* @return <code>true</code> if there are more tokens <code>false</code>
* otherwise
*/
public boolean hasNext() {
int nextChar = currentChar;
return (nextChar != EOF);
}
public void remove() {
throw new UnsupportedOperationException();
}
/**
* Advances the currentPosition pointer by 1 (if not exceeding length of
* inputText, and returns the character pointed by currentPosition.
*
* @return the next character EOF if no more characters exist
*/
private int getNextChar() {
if (reader != null) {
try {
int readVal = reader.read();
if (readVal == -1) {
currentChar = EOF;
} else {
currentChar = (char) readVal;
}
} catch (IOException ioe) {
currentChar = EOF;
errorDescription = ioe.getMessage();
}
} else if (inputText != null) {
if (currentPosition < inputText.length()) {
currentChar = (int) inputText.charAt(currentPosition);
} else {
currentChar = EOF;
}
}
if (currentChar != EOF) {
currentPosition++;
}
if (currentChar == '\n') {
lineNumber++;
}
return currentChar;
}
/**
* Starting from the current position of the input text, returns the
* subsequent characters of type charClass, and not of type
* singleCharSymbols.
*
* @param charClass the type of characters to look for
* @param buffer the place to append characters of type charClass
*
* @return a string of characters starting from the current position of the
* input text, until it encounters a character not in the string
* charClass
*
*/
private String getTokenOfCharClass(String charClass) {
return getTokenByCharClass(charClass, true);
}
/**
* Starting from the current position of the input text/file, returns the
* subsequent characters, not of type singleCharSymbols, and ended at
* characters of type endingCharClass. E.g., if the current string is
* "xxxxyyy", endingCharClass is "yz", and singleCharClass "abc". Then this
* method will return to "xxxx".
*
* @param endingCharClass the type of characters to look for
*
* @return a string of characters from the current position until it
* encounters characters in endingCharClass
*
*/
private String getTokenNotOfCharClass(String endingCharClass) {
return getTokenByCharClass(endingCharClass, false);
}
/**
* Provides a `compressed' method from getTokenOfCharClass() and
* getTokenNotOfCharClass(). If parameter containThisCharClass is
* <code>true</code>, then a string from the current position to the last
* character in charClass is returned. If containThisCharClass is
* <code>false</code> , then a string before the first occurrence of a
* character in containThisCharClass is returned.
*
* @param charClass the string of characters you want included or excluded
* in your return
* @param containThisCharClass determines if you want characters in
* charClass in the returned string or not
*
* @return a string of characters from the current position until it
* encounters characters in endingCharClass
*/
private String getTokenByCharClass(String charClass,
boolean containThisCharClass) {
final StringBuilder buffer = new StringBuilder();
// if we want the returned string to contain chars in charClass, then
// containThisCharClass is TRUE and
// (charClass.indexOf(currentChar) != 1) == containThisCharClass)
// returns true; if we want it to stop at characters of charClass,
// then containThisCharClass is FALSE, and the condition returns
// false.
while ((charClass.indexOf(currentChar) != -1) == containThisCharClass
&& singleCharSymbols.indexOf(currentChar) == -1
&& currentChar != EOF) {
buffer.append((char) currentChar);
getNextChar();
}
return buffer.toString();
}
/**
* Removes the postpunctuation characters from the current token. Copies
* those postpunctuation characters to the class variable
* 'postpunctuation'.
*/
private void removeTokenPostpunctuation() {
if (token == null) {
return;
}
final String tokenWord = token.getWord();
int tokenLength = tokenWord.length();
int position = tokenLength - 1;
while (position > 0
&& postpunctuationSymbols.indexOf((int) tokenWord
.charAt(position)) != -1) {
position--;
}
if (tokenLength - 1 != position) {
// Copy postpunctuation from token
token.setPostpunctuation(tokenWord.substring(position + 1));
// truncate token at postpunctuation
token.setWord(tokenWord.substring(0, position + 1));
} else {
token.setPostpunctuation("");
}
}
/**
* Returns <code>true</code> if there were errors while reading tokens
*
* @return <code>true</code> if there were errors; <code>false</code>
* otherwise
*/
public boolean hasErrors() {
return errorDescription != null;
}
/**
* if hasErrors returns <code>true</code>, this will return a description
* of the error encountered, otherwise it will return <code>null</code>
*
* @return a description of the last error that occurred.
*/
public String getErrorDescription() {
return errorDescription;
}
/**
* Determines if the current token should start a new sentence.
*
* @return <code>true</code> if a new sentence should be started
*/
public boolean isSentenceSeparator() {
String tokenWhiteSpace = token.getWhitespace();
String lastTokenPostpunctuation = null;
if (lastToken != null) {
lastTokenPostpunctuation = lastToken.getPostpunctuation();
}
if (lastToken == null || token == null) {
return false;
} else if (tokenWhiteSpace.indexOf('\n') != tokenWhiteSpace
.lastIndexOf('\n')) {
return true;
} else if (lastTokenPostpunctuation.indexOf(':') != -1
|| lastTokenPostpunctuation.indexOf('?') != -1
|| lastTokenPostpunctuation.indexOf('!') != -1) {
return true;
} else if (lastTokenPostpunctuation.indexOf('.') != -1
&& tokenWhiteSpace.length() > 1
&& Character.isUpperCase(token.getWord().charAt(0))) {
return true;
} else {
String lastWord = lastToken.getWord();
int lastWordLength = lastWord.length();
if (lastTokenPostpunctuation.indexOf('.') != -1
&&
/* next word starts with a capital */
Character.isUpperCase(token.getWord().charAt(0))
&&
/* last word isn't an abbreviation */
!(Character.isUpperCase(lastWord
.charAt(lastWordLength - 1)) || (lastWordLength < 4 && Character
.isUpperCase(lastWord.charAt(0))))) {
return true;
}
}
return false;
}
}