/* $Id: MyTokenizer.java 17887 2010-01-12 21:17:18Z linus $
*****************************************************************************
* Copyright (c) 2009 Contributors - see below
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* tfmorris
*****************************************************************************
*
* Some portions of this file was previously release using the BSD License:
*/
// Copyright (c) 1996-2006 The Regents of the University of California. All
// Rights Reserved. Permission to use, copy, modify, and distribute this
// software and its documentation without fee, and without a written
// agreement is hereby granted, provided that the above copyright notice
// and this paragraph appear in all copies. This software program and
// documentation are copyrighted by The Regents of the University of
// California. The software program and documentation are supplied "AS
// IS", without any accompanying services from The Regents. The Regents
// does not warrant that the operation of the program will be
// uninterrupted or error-free. The end-user understands that the program
// was developed for research purposes and is advised not to rely
// exclusively on the program for any reason. IN NO EVENT SHALL THE
// UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
// SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
// ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
// THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
// PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
// CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
// UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
package org.argouml.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.List;
import java.util.NoSuchElementException;
/**
 * Internal class for managing the delimiters in MyTokenizer. It's rather
 * similar to CustomSeparator, but faster for short constant strings.
 *
 * <p>Matching is done with a bit mask: after each call to
 * {@link #addChar(char)}, bit <code>i</code> of the mask is set exactly when
 * the last <code>i + 1</code> characters fed equal the first
 * <code>i + 1</code> characters of the delimiter. The mask is an
 * <code>int</code>, which is why delimiters are limited to 32 characters.
 *
 * <p>Instances can be chained into a singly linked list through
 * {@link #setNext(TokenSep)} and {@link #getNext()}.
 */
class TokenSep {
    /** The next delimiter in the list, or null if this is the last one. */
    private TokenSep next = null;

    /** The delimiter text to look for. */
    private final String theString;

    /** Cached length of {@link #theString}. */
    private final int length;

    /** Bit mask of the delimiter prefixes matched by the latest characters. */
    private int pattern;

    /**
     * Constructs a TokenSep that will match the String given in str.
     *
     * @param str The delimiter string.
     * @throws IllegalArgumentException if str is more than 32 characters
     *         long.
     * @throws NullPointerException if str is null.
     */
    public TokenSep(String str) {
        theString = str;
        length = str.length();
        if (length > 32) {
            throw new IllegalArgumentException("TokenSep " + str
                    + " is " + length + " (> 32) chars long");
        }
        pattern = 0;
    }

    /**
     * Called by MyTokenizer when a new character is processed in the
     * sequence. Returns true if we have found the delimiter.
     *
     * <p>An empty delimiter never matches.
     *
     * @param c The character that was just read.
     * @return true if the characters fed so far end with the delimiter.
     */
    public boolean addChar(char c) {
        if (length == 0) {
            // Without this guard the test below would look at bit 31
            // (1 << -1 == 1 << 31) and report a bogus match once 32
            // characters had been fed.
            return false;
        }

        // Every prefix matched so far may grow by one character, and a new
        // one-character prefix may start here.
        pattern = (pattern << 1) | 1;

        // Drop each candidate whose newest position does not hold c.
        for (int i = 0; i < length; i++) {
            if (theString.charAt(i) != c) {
                pattern &= ~(1 << i);
            }
        }

        // The whole delimiter is matched when the top prefix bit survives.
        return (pattern & (1 << (length - 1))) != 0;
    }

    /**
     * Called by MyTokenizer before starting scanning for a new token.
     * Forgets all partial matches.
     */
    public void reset() {
        pattern = 0;
    }

    /**
     * Gets the length of this token.
     *
     * @return The number of characters in the delimiter.
     */
    public int length() {
        return length;
    }

    /**
     * Gets this token.
     *
     * @return The delimiter string given to the constructor.
     */
    public String getString() {
        return theString;
    }

    /**
     * @param n The next to set.
     */
    public void setNext(TokenSep n) {
        this.next = n;
    }

    /**
     * @return Returns the next.
     */
    public TokenSep getNext() {
        return next;
    }
}
/**
 * A descendent of CustomSeparator that recognizes tokens on one of two forms:
 * <ul>
 * <li><pre>'chr'.....'esc' 'chr'.....'chr'</pre>
 * <li><pre>'lchr'...'lchr'...'rchr'...'esc' 'rchr'....'rchr'</pre></ul>
 *
 * <p>The first form is suited for quoted strings, like <pre>"...\"...."</pre>
 * or <pre>'...\'...'</pre>.
 *
 * <p>The second form is suited for expressions, like
 * <pre>(a+(b*c)-15*eq(a, b))</pre>.
 *
 * <p>This is in fact the class currently used for the public separators in
 * MyTokenizer, except PAREN_EXPR_STRING_SEPARATOR and LINE_SEPARATOR.
 */
class QuotedStringSeparator extends CustomSeparator {
    /** The escape character, or 0 if escaping is disabled. */
    private final char escChr;

    /** The character that opens a nested level, or 0 if there is none. */
    private final char startChr;

    /** The character that closes a level. */
    private final char stopChr;

    /** True when the previous character was an unescaped escape character. */
    private boolean esced;

    /** Number of characters passed to {@link #endChar(char)} for this token. */
    private int tokLen;

    /** Current nesting depth; the token ends when it reaches zero. */
    private int level;

    /**
     * Creates a separator of the first form (see above) where
     * 'chr' = q and 'esc' = esc.
     *
     * @param q The delimiter character.
     * @param esc The escape character.
     */
    public QuotedStringSeparator(char q, char esc) {
        super(q);

        esced = false;
        escChr = esc;
        // No nesting for quoted strings: only the closing quote counts.
        startChr = 0;
        stopChr = q;
        tokLen = 0;
        level = 1;
    }

    /**
     * Creates a separator of the second form (see above) where
     * 'lchr' = sq, 'rchr' = eq and 'esc' = esc.
     *
     * @param sq The left delimiter character.
     * @param eq The right delimiter character.
     * @param esc The escape character.
     */
    public QuotedStringSeparator(char sq, char eq, char esc) {
        super(sq);

        esced = false;
        escChr = esc;
        startChr = sq;
        stopChr = eq;
        tokLen = 0;
        level = 1;
    }

    /**
     * Prepares the separator for scanning a new token.
     *
     * <p>Besides the length and nesting level this also clears a pending
     * escape. The separator instances are shared, so a token that was cut
     * off by the end of the input right after an escape character would
     * otherwise make the next scan ignore its first character.
     */
    public void reset() {
        super.reset();
        esced = false;
        tokLen = 0;
        level = 1;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return the entire length of the token.
     */
    public int tokenLength() {
        return super.tokenLength() + tokLen;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return true.
     *
     * @return true
     */
    public boolean hasFreePart() {
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the end of the token.
     *
     * @param c The next character of the token.
     * @return true when c closes the outermost level.
     */
    public boolean endChar(char c) {
        tokLen++;

        // A character following the escape character is taken literally.
        if (esced) {
            esced = false;
            return false;
        }

        if (escChr != 0 && c == escChr) {
            esced = true;
            return false;
        }

        // The two tests are deliberately independent: when start and stop
        // are the same character the level is raised and lowered again.
        if (startChr != 0 && c == startChr) {
            level++;
        }
        if (c == stopChr) {
            level--;
        }

        return level <= 0;
    }
}
/**
 * A descendent of CustomSeparator that recognizes tokens on the form:
 *
 * <br>( " \" ) " ' \' ) ' )
 *
 * <p>That is, an expression inside parentheses with proper consideration
 * for quoted strings inside the expression.
 */
class ExprSeparatorWithStrings extends CustomSeparator {
    /** True while scanning inside a single quoted string. */
    private boolean isSQuot;

    /** True while scanning inside a double quoted string. */
    private boolean isDQuot;

    /** True when the previous character in a string was a backslash. */
    private boolean isEsc;

    /** Parenthesis nesting depth; the token ends when it reaches zero. */
    private int tokLevel;

    /** Number of characters passed to {@link #endChar(char)} so far. */
    private int tokLen;

    /**
     * The constructor. No choices available.
     */
    public ExprSeparatorWithStrings() {
        super('(');
        clearState();
    }

    /**
     * Puts the scanning state back to "just after the opening parenthesis".
     */
    private void clearState() {
        isEsc = false;
        isSQuot = false;
        isDQuot = false;
        tokLevel = 1;
        tokLen = 0;
    }

    /**
     * Prepares the separator for scanning a new token.
     */
    public void reset() {
        super.reset();
        clearState();
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return the entire length of the token.
     */
    public int tokenLength() {
        return super.tokenLength() + tokLen;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return true.
     *
     * @return true
     */
    public boolean hasFreePart() {
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the end of the token.
     *
     * @param c The next character of the token.
     * @return true when c is the parenthesis that closes the expression.
     */
    public boolean endChar(char c) {
        tokLen++;

        if (isSQuot || isDQuot) {
            // Inside a string nothing can end the token; only track
            // escapes and the matching closing quote.
            if (isEsc) {
                isEsc = false;
            } else if (c == '\\') {
                isEsc = true;
            } else if (isSQuot) {
                if (c == '\'') {
                    isSQuot = false;
                }
            } else if (c == '\"') {
                isDQuot = false;
            }
            return false;
        }

        switch (c) {
        case '\'':
            isSQuot = true;
            break;
        case '\"':
            isDQuot = true;
            break;
        case '(':
            tokLevel++;
            break;
        case ')':
            tokLevel--;
            break;
        default:
            break;
        }
        return tokLevel <= 0;
    }
}
/**
 * A descendent of CustomSeparator that recognizes "the three line ends":
 * <ul>
 * <li>UNIX: &lt;lf&gt;</li>
 * <li>DOS: &lt;cr&gt; &lt;lf&gt;</li>
 * <li>MAC: &lt;cr&gt;</li>
 * </ul>
 *
 * <p>This is in fact the class currently used for LINE_SEPARATOR in
 * MyTokenizer.
 */
class LineSeparator extends CustomSeparator {
    /** True once a carriage return has been seen. */
    private boolean hasCr;

    /** True once a line feed has been seen. */
    private boolean hasLf;

    /** True if the character after a carriage return was not a line feed. */
    private boolean hasPeeked;

    /**
     * Creates a LineSeparator.
     */
    public LineSeparator() {
        hasPeeked = false;
        hasLf = false;
        hasCr = false;
    }

    /**
     * Prepares the separator for scanning a new line end.
     */
    public void reset() {
        super.reset();
        hasPeeked = false;
        hasLf = false;
        hasCr = false;
    }

    /**
     * {@inheritDoc}
     *
     * @return 2 for a carriage return followed by a line feed, otherwise 1.
     */
    public int tokenLength() {
        if (hasCr && hasLf) {
            return 2;
        }
        return 1;
    }

    /**
     * {@inheritDoc}
     *
     * @return 1 if a character that is not part of the line end was
     *         consumed by {@link #endChar(char)}, otherwise 0.
     */
    public int getPeekCount() {
        if (hasPeeked) {
            return 1;
        }
        return 0;
    }

    /**
     * {@inheritDoc}
     *
     * @return true as long as no line feed has been seen, since a carriage
     *         return may still be followed by one.
     */
    public boolean hasFreePart() {
        if (hasLf) {
            return false;
        }
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the start of a line-end.
     */
    public boolean addChar(char c) {
        switch (c) {
        case '\n':
            hasLf = true;
            return true;
        case '\r':
            hasCr = true;
            return true;
        default:
            return false;
        }
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the end of a line-end.
     *
     * @return true, always: the line end is complete after one more
     *         character whether or not that character belongs to it.
     */
    public boolean endChar(char c) {
        if (c != '\n') {
            // Not ours; remember that one character too many was read.
            hasPeeked = true;
        } else {
            hasLf = true;
        }
        return true;
    }
}
/**
* Class for dividing a String into any number of parts. Each part will be a
* substring of the original String. The first part will at least contain the
* first character in the string. All following parts will at least contain
* the first character in the String not covered by any previous part.
*
* <p>The delim parameter to the constructors is a comma separated list of
* tokens that should be recognized by the tokenizer. These tokens will be
* returned by the tokenizer as tokens, and any arbitrary text between them
* will also be returned as tokens. Since the comma has special meaning in
* this string, it can be escaped with \ to only mean itself (like in "\\,").
* For technical reasons it is not possible for any token in this list to be
* more than 32 characters long.
*
* <p>In addition to the delim parameter it is also possible to use custom
* separators that allow any string that can be generated by the limited
* version of a Turing machine that your computer is, to be used as a
* delimiter.
*
* <p>There are some custom separators provided that you can use to get
* things like strings in one token. These cannot be used simultaneously by
* several tokenizers, ie they are not thread safe.
*
* <p>The tokenizer works in a kind of greedy way. When the first separator
* token from delim is matched or any CustomSeparator returns true from
* addChar, then it is satisfied it has found a token and does NOT check if
* it could have found a longer token. Eg: if you have this delim string
* "<,<<", then "<<" will never be found.
*
* <p><b>Example</b><br><pre>
* MyTokenizer tzer = new MyTokenizer("Hello, how are you?", " ,\\,");
* while (tzer.hasMoreTokens())
* _cat.info("\"" + tzer.nextToken() + "\"");
* </pre>
*
* <p>Which whould yield the following output:<pre>
* "Hello"
* ","
* " "
* "how"
* " "
* "are"
* " "
* "you?"
* </pre>
*
* @author Michael Stockman
* @since 0.11.2
* @see CustomSeparator
*/
public class MyTokenizer implements Enumeration {
/** A custom separator for quoted strings enclosed in single quotes
* and using \ as escape character. There may not be an end quote
* if the tokenizer reaches the end of the String. */
public static final CustomSeparator SINGLE_QUOTED_SEPARATOR =
new QuotedStringSeparator('\'', '\\');
/** A custom separator for quoted strings enclosed in double quotes
* and using \ as escape character. There may not be an end quote
* if the tokenizer reaches the end of the String. */
public static final CustomSeparator DOUBLE_QUOTED_SEPARATOR =
new QuotedStringSeparator('\"', '\\');
/** A custom separator for expressions enclosed in parentheses and
* matching lparams with rparams. There may not be proper matching
* if the tokenizer reaches the end of the String. Do not use this
* together with PAREN_EXPR_STRING_SEPARATOR. */
public static final CustomSeparator PAREN_EXPR_SEPARATOR =
new QuotedStringSeparator('(', ')', '\0');
/** A custom separator for expressions enclosed in parentheses and
* matching lparams with rparams. There may not be proper matching
* if the tokenizer reaches the end of the String. It also takes
* quoted strings (either single or double quotes) in the expression
* into consideration, unlike PAREN_EXPR_SEPARATOR. Do not use this
* together with PAREN_EXPR_SEPARATOR. */
public static final CustomSeparator PAREN_EXPR_STRING_SEPARATOR =
new ExprSeparatorWithStrings();
/** A custom separator for texts. Singles out the line ends,
* and consequently the lines, if they are in either dos, mac
* or unix format. */
public static final CustomSeparator LINE_SEPARATOR =
new LineSeparator();
private int sIdx;
private final int eIdx;
private int tokIdx;
private final String source;
private final TokenSep delims;
private String savedToken;
private int savedIdx;
private List customSeps;
private String putToken;
/**
* Constructs a new instance. See above for a description of the
* delimiter string.
*
* @param string The String to be tokenized.
* @param delim The String of delimiters.
*/
public MyTokenizer(String string, String delim) {
source = string;
delims = parseDelimString(delim);
sIdx = 0;
tokIdx = 0;
eIdx = string.length();
savedToken = null;
customSeps = null;
putToken = null;
}
/**
* Constructs a new instance. See above for a description of the
* delimiter string and custom separators.
*
* @param string The String to be tokenized.
* @param delim The String of delimiters.
* @param sep A custom separator to use.
*/
public MyTokenizer(String string, String delim, CustomSeparator sep) {
source = string;
delims = parseDelimString(delim);
sIdx = 0;
tokIdx = 0;
eIdx = string.length();
savedToken = null;
customSeps = new ArrayList();
customSeps.add(sep);
}
/**
* Constructs a new instance. See above for a description of the
* delimiter string and custom separators.
*
* @param string The String to be tokenized.
* @param delim The String of delimiters.
* @param seps Some container with custom separators to use.
*/
public MyTokenizer(String string, String delim, Collection seps) {
source = string;
delims = parseDelimString(delim);
sIdx = 0;
tokIdx = 0;
eIdx = string.length();
savedToken = null;
customSeps = new ArrayList(seps);
}
/**
* Returns true if there are more tokens left.
*
* @return true if another token can be fetched with nextToken.
*/
public boolean hasMoreTokens() {
return sIdx < eIdx || savedToken != null
|| putToken != null;
}
/**
* Retrives the next token.
*
* @return The next token.
*/
public String nextToken() {
CustomSeparator csep;
TokenSep sep;
String s = null;
int i, j;
if (putToken != null) {
s = putToken;
putToken = null;
return s;
}
if (savedToken != null) {
s = savedToken;
tokIdx = savedIdx;
savedToken = null;
return s;
}
if (sIdx >= eIdx)
throw new NoSuchElementException(
"No more tokens available");
for (sep = delims; sep != null; sep = sep.getNext())
sep.reset();
if (customSeps != null) {
for (i = 0; i < customSeps.size(); i++)
((CustomSeparator) customSeps.get(i)).reset();
}
for (i = sIdx; i < eIdx; i++) {
char c = source.charAt(i);
for (j = 0; customSeps != null
&& j < customSeps.size(); j++) {
csep = (CustomSeparator) customSeps.get(j);
if (csep.addChar(c))
break;
}
if (customSeps != null && j < customSeps.size()) {
csep = (CustomSeparator) customSeps.get(j);
while (csep.hasFreePart() && i + 1 < eIdx)
if (csep.endChar(source.charAt(++i)))
break;
i -= Math.min(csep.getPeekCount(), i);
int clen = Math.min(i + 1, source.length());
if (i - sIdx + 1 > csep.tokenLength()) {
s = source.substring(sIdx,
i - csep.tokenLength() + 1);
savedIdx = i - csep.tokenLength() + 1;
savedToken = source.substring(
savedIdx, clen);
} else {
s = source.substring(sIdx, clen);
}
tokIdx = sIdx;
sIdx = i + 1;
break;
}
for (sep = delims; sep != null; sep = sep.getNext())
if (sep.addChar(c))
break;
if (sep != null) {
if (i - sIdx + 1 > sep.length()) {
s = source.substring(sIdx,
i - sep.length() + 1);
savedIdx = i - sep.length() + 1;
savedToken = sep.getString();
} else {
s = sep.getString();
}
tokIdx = sIdx;
sIdx = i + 1;
break;
}
}
if (s == null) {
s = source.substring(sIdx);
tokIdx = sIdx;
sIdx = eIdx;
}
return s;
}
/**
* This class implements the Enumeration interface. This call maps
* to nextToken.
*
* @return nextToken();
* @see #nextToken() nextToken
*/
public Object nextElement() {
return nextToken();
}
/**
* This class implements the Enumeration interface. This call maps
* to hasMoreTokens.
*
* @return hasMoreTokens();
* @see #hasMoreTokens() hasMoreTokens
*/
public boolean hasMoreElements() {
return hasMoreTokens();
}
/**
* Returns the index in the string of the last token returned by
* nextToken, or zero if no token has been retrived.
*
* @return The index of the last token.
*/
public int getTokenIndex() {
return tokIdx;
}
/**
* Put a token on the input stream. This will be the next token read
* from the tokenizer. If this function is called again before the
* last token has been read, then it will be lost.
*
* <p>The index returned from getTokenIndex will be the same for the
* token put as that of the last token that wasn't put.
*
* @param s The token to put.
* @throws NullPointerException if s is null.
*/
public void putToken(String s) {
if (s == null)
throw new NullPointerException(
"Cannot put a null token");
putToken = s;
}
/**
* Creates a linked list of TokenSeps from the comma separated string
* str.
*
* @param str The string specifying delimiter strings.
* @return A list of TokenSeps.
*/
private static TokenSep parseDelimString(String str) {
TokenSep first = null;
TokenSep p = null;
int idx0, idx1, length;
StringBuilder val = new StringBuilder();
char c;
length = str.length();
for (idx0 = 0; idx0 < length;) {
for (idx1 = idx0; idx1 < length; idx1++) {
c = str.charAt(idx1);
if (c == '\\') {
idx1++;
if (idx1 < length)
val.append(str.charAt(idx1));
} else if (c == ',') {
break;
} else {
val.append(c);
}
}
idx1 = Math.min(idx1, length);
if (idx1 > idx0) {
p = new TokenSep(val.toString());
val = new StringBuilder();
p.setNext(first);
first = p;
}
idx0 = idx1 + 1;
}
return first;
}
}