/* * LingPipe v. 3.8 * Copyright (C) 2003-2009 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */ package arkref.sent; import com.aliasi.tokenizer.Tokenizer; /** * @author Bob Carpenter * @version 3.8.1 * @since LingPipe1.0 */ class MyIndoEuropeanTokenizer extends Tokenizer { private final char[] mChars; private final int mLastPosition; private final int mStartPosition; private int mPosition; private int mTokenStart; private int mLastTokenIndex; private int mLastTokenStartPosition = -1; private int mLastTokenEndPosition = -1; /** * Construct a tokenizer from the specified character range. The * characters are not copied, so they should not be modified during * tokenization. * * @param ch Characters to tokenize. * @param offset Index of first character to tokenize. * @param length Number of characters to tokenize. * @throws IllegalArgumentException If the slice parameters are * out of bounds. */ public MyIndoEuropeanTokenizer(char[] ch, int offset, int length) { if (offset < 0 || offset + length > ch.length) { String msg = "Illegal slice." + " cs.length=" + ch.length + " offset=" + offset + " length=" + length; throw new IllegalArgumentException(msg); } mChars = ch; mPosition = offset; mLastPosition = offset+length; mTokenStart = -1; mLastTokenIndex = -1; mStartPosition = offset; } /** * Creates a tokenizer from the specified string. * * @param chars Characters to tokenize. */ public MyIndoEuropeanTokenizer(String chars) { this(chars.toCharArray(),0,chars.length()); } /** * Create a tokenizer from the specified string buffer. The * contents of the buffer are copied, so modifications to the * buffer do not affect tokenization. * * @param chars String buffer whose characters are tokenized. */ public MyIndoEuropeanTokenizer(StringBuilder chars) { this(chars.toString()); } @Override public int lastTokenStartPosition() { return mLastTokenStartPosition; } @Override public int lastTokenEndPosition() { return mLastTokenEndPosition; } /** * Returns the next whitespace. Returns the same result for * subsequent calls without a call to <code>nextToken</code>. * * @return The next space. */ @Override public String nextWhitespace() { StringBuilder sb = new StringBuilder(); while (hasMoreCharacters() && Character.isWhitespace(currentChar())) { sb.append(currentChar()); ++mPosition; } return sb.toString(); } /** * Returns <code>true</code> if the specified character is a * letter as determined by {@link Character#isLetter(char)} or is * a Devanagari character in the unicode range <code>0x0900</code> * to <code>0x097F</code>. * * @param c Character to test. * @return <code>true</code> if the character is a Java letter or * a Devanagari character. */ private static boolean isLetter(char c) { return Character.isLetter(c) || devanagari(c); } /** * Returns <code>true</code> if the specified character is in the * Devanagari range, unicode <code>0x0900</code> to * <code>0x097F</code>, inclusive. * * @param code Code number to test. * @return <code>true</code> if */ private static boolean devanagari(char unicode) { return (unicode >= 0x0900 && unicode <= 0x097F); } /** * Returns the next token in the stream, or <code>null</code> if * there are no more tokens. Flushes any whitespace that has * not been returned. * * @return The next token, or <code>null</code> if there are no * more tokens. */ @Override public String nextToken() { skipWhitespace(); if (!hasMoreCharacters()) return null; mTokenStart = mPosition; ++mLastTokenIndex; char startChar = mChars[mPosition++]; // update to deal with initial period digits properly if (startChar == '.') { while (currentCharEquals('.')) ++mPosition; return currentToken(); } if (startChar == '-') { while (currentCharEquals('-')) ++mPosition; return currentToken(); } if (startChar == '=') { while (currentCharEquals('=')) ++mPosition; return currentToken(); } if (startChar == '\'') { if (currentCharEquals('\'')) ++mPosition; return currentToken(); } if (startChar == '`') { if (currentCharEquals('`')) ++mPosition; return currentToken(); } if (isLetter(startChar)) return alphaNumToken(); if (Character.isDigit(startChar)) return numToken(); return currentToken(); // other single character symbol } /** * Returns <code>true</code> if there are more characters * in the input character sequence. * * @return <code>true</code> if there are more characters * to be tokenized. */ private boolean hasMoreCharacters() { return mPosition < mLastPosition; } /** * Returns the character in the underlying sequence at * the current position. * * @return The character in the underlying sequence at * the current position. */ private char currentChar() { return mChars[mPosition]; } /** * Returns <code>true</code> if there are more characters and the * current character is equal to the specified character. * * @param c Character to test. * @return <code>true</code> if the current character is equal to * the specified character. */ private boolean currentCharEquals(char c) { return hasMoreCharacters() && currentChar() == c; } /** * Advances the position to the first character of the * next token, or to the end of the file if there are * no more tokens. */ private void skipWhitespace() { while (hasMoreCharacters() && Character.isWhitespace(currentChar())) ++mPosition; } /** * Returns the current token as a string. * * @return Current token as a string. */ private String currentToken() { mLastTokenStartPosition = mTokenStart - mStartPosition; mLastTokenEndPosition = mPosition - mStartPosition; return new String(mChars,mTokenStart,mPosition-mTokenStart); } /** * Completes and returns a token that begins with the previous * letter character. * * @return Longest token extending the previous character. */ private String alphaNumToken() { while (hasMoreCharacters() && (isLetter(currentChar()) || Character.isDigit(currentChar()))) ++mPosition; return currentToken(); } /** * Completes and returns a token that begins with the previous * digit character. * * @return Token beginning at previous character, and extending * to all subsequent digits, commas, and periods. */ private String numToken() { while (hasMoreCharacters()) { if (isLetter(currentChar())) { ++mPosition; return alphaNumToken(); } if (Character.isDigit(currentChar())) { ++mPosition; continue; } if (currentChar() == '.' || currentChar() == ',') { return numPunctToken(); } return currentToken(); } return currentToken(); } /** * Completes and returns a token that begins with previous * numbers and commas or periods. * * @return Token beginning at previous character, and extending * to all subsequent digits, commas, and periods. */ private String numPunctToken() { while (hasMoreCharacters()) { if (Character.isDigit(currentChar())) { ++mPosition; } else if (currentChar() == '.' || currentChar() == ',') { ++mPosition; if (!hasMoreCharacters() || !Character.isDigit(currentChar())) { --mPosition; return currentToken(); } } else { return currentToken(); } } return currentToken(); } /** * Returns a tokenized version of the specified string. * * @param phrase Characters to tokenize. * @return Array of tokens generated by characters. */ public static String[] tokenize(String phrase) { return new MyIndoEuropeanTokenizer(phrase).tokenize(); } }