/* $Id: MyTokenizer.java 17887 2010-01-12 21:17:18Z linus $ ***************************************************************************** * Copyright (c) 2009 Contributors - see below * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * tfmorris ***************************************************************************** * * Some portions of this file was previously release using the BSD License: */ // Copyright (c) 1996-2006 The Regents of the University of California. All // Rights Reserved. Permission to use, copy, modify, and distribute this // software and its documentation without fee, and without a written // agreement is hereby granted, provided that the above copyright notice // and this paragraph appear in all copies. This software program and // documentation are copyrighted by The Regents of the University of // California. The software program and documentation are supplied "AS // IS", without any accompanying services from The Regents. The Regents // does not warrant that the operation of the program will be // uninterrupted or error-free. The end-user understands that the program // was developed for research purposes and is advised not to rely // exclusively on the program for any reason. IN NO EVENT SHALL THE // UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, // SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, // ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF // THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE // PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF // CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, // UPDATES, ENHANCEMENTS, OR MODIFICATIONS. package org.argouml.util; import java.util.ArrayList; import java.util.Collection; import java.util.Enumeration; import java.util.List; import java.util.NoSuchElementException; /** * Internal class for managing the delimiters in MyTokenizer. It's rather * similar to CustomSeparator, but faster for short constant strings. */ class TokenSep { private TokenSep next = null; private final String theString; private final int length; private int pattern; /** * Constructs a TokenSep that will match the String given in str. * * @param str The delimiter string. */ public TokenSep(String str) { theString = str; length = str.length(); if (length > 32) throw new IllegalArgumentException("TokenSep " + str + " is " + length + " (> 32) chars long"); pattern = 0; } /** * Called by MyTokenizer when a new character is processed in the * sequence. Returns true if we have found the delimiter. */ public boolean addChar(char c) { int i; pattern <<= 1; pattern |= 1; for (i = 0; i < length; i++) { if (theString.charAt(i) != c) { pattern &= ~(1 << i); } } return (pattern & (1 << (length - 1))) != 0; } /** * Called by MyTokenizer before starting scanning for a new token. */ public void reset() { pattern = 0; } /** * Gets the length of this token. */ public int length() { return length; } /** * Gets this token. */ public String getString() { return theString; } /** * @param n The next to set. */ public void setNext(TokenSep n) { this.next = n; } /** * @return Returns the next. */ public TokenSep getNext() { return next; } } /** * A descendent of CustomSeparator that recognizes tokens on one of two forms: * <ul> * <li><pre>'chr'.....'esc' 'chr'.....'chr'</pre> * <li><pre>'lchr'...'lchr'...'rchr'...'esc' 'rchr'....'rchr'</pre></ul> * * <p>The first form is suited for quoted strings, like <pre>"...\"...."</pre> * or <pre>'...\'...'</pre>. * * <p>The second form is suited for expressions, like * <pre>(a+(b*c)-15*eq(a, b))</pre>. * * <p>This is in fact the class currently used for the public separators in * MyTokenizer, except PAREN_EXPR_STRING_SEPARATOR and LINE_SEPARATOR. */ class QuotedStringSeparator extends CustomSeparator { private final char escChr; private final char startChr; private final char stopChr; private boolean esced; private int tokLen; private int level; /** * Creates a separator of the first form (see above) where * 'chr' = q and 'esc' = esc. * * @param q The delimiter character. * @param esc The escape character. */ public QuotedStringSeparator(char q, char esc) { super(q); esced = false; escChr = esc; startChr = 0; stopChr = q; tokLen = 0; level = 1; } /** * Creates a separator of the second form (see above) where * 'lchr' = sq, 'rchr' = eq and 'esc' = esc. * * @param sq The left delimiter character. * @param eq The right delimiter character. * @param esc The escape character. */ public QuotedStringSeparator(char sq, char eq, char esc) { super(sq); esced = false; escChr = esc; startChr = sq; stopChr = eq; tokLen = 0; level = 1; } public void reset() { super.reset(); tokLen = 0; level = 1; } /** * {@inheritDoc} * * Overridden to return the entire length of the token. */ public int tokenLength() { return super.tokenLength() + tokLen; } /** * {@inheritDoc} * * Overridden to return true. * * @return true */ public boolean hasFreePart() { return true; } /** * {@inheritDoc} * * Overridden to find the end of the token. */ public boolean endChar(char c) { tokLen++; if (esced) { esced = false; return false; } if (escChr != 0 && c == escChr) { esced = true; return false; } if (startChr != 0 && c == startChr) level++; if (c == stopChr) level--; return level <= 0; } } /** * A descendent of CustomSeparator that recognizes tokens on the form: * * <br>( " \" ) " ' \' ) ' ) * * <p>This is, an expression inside parentheses with proper consideration * for quoted strings inside the the expression. */ class ExprSeparatorWithStrings extends CustomSeparator { private boolean isSQuot; private boolean isDQuot; private boolean isEsc; private int tokLevel; private int tokLen; /** * The constructor. No choices available. */ public ExprSeparatorWithStrings() { super('('); isEsc = false; isSQuot = false; isDQuot = false; tokLevel = 1; tokLen = 0; } public void reset() { super.reset(); isEsc = false; isSQuot = false; isDQuot = false; tokLevel = 1; tokLen = 0; } /** * {@inheritDoc} * * Overridden to return the entire length of the token. */ public int tokenLength() { return super.tokenLength() + tokLen; } /** * {@inheritDoc} * * Overridden to return true. * * @return true */ public boolean hasFreePart() { return true; } /** * {@inheritDoc} * * Overridden to find the end of the token. */ public boolean endChar(char c) { tokLen++; if (isSQuot) { if (isEsc) { isEsc = false; return false; } if (c == '\\') isEsc = true; else if (c == '\'') isSQuot = false; return false; } else if (isDQuot) { if (isEsc) { isEsc = false; return false; } if (c == '\\') isEsc = true; else if (c == '\"') isDQuot = false; return false; } else { if (c == '\'') isSQuot = true; else if (c == '\"') isDQuot = true; else if (c == '(') tokLevel++; else if (c == ')') tokLevel--; return tokLevel <= 0; } } } /** * A descendent of CustomSeparator that recognizes "the tree line ends": * <ul> * <li>UNIX: <lf></li> * <li>DOS: <cr> <lf></li> * <li>MAC: <cr></li> * </ul> * * <p>This is in fact the class currently used LINE_SEPARATOR in MyTokenizer. */ class LineSeparator extends CustomSeparator { private boolean hasCr; private boolean hasLf; private boolean hasPeeked; /** * Creates a LineSeparator. */ public LineSeparator() { hasCr = false; hasLf = false; hasPeeked = false; } public void reset() { super.reset(); hasCr = false; hasLf = false; hasPeeked = false; } /** * {@inheritDoc} */ public int tokenLength() { return hasCr && hasLf ? 2 : 1; } /** * {@inheritDoc} */ public int getPeekCount() { return hasPeeked ? 1 : 0; } /** * {@inheritDoc} */ public boolean hasFreePart() { return !hasLf; } /** * {@inheritDoc} * * Overridden to find the start of a line-end. */ public boolean addChar(char c) { if (c == '\n') { hasLf = true; return true; } if (c == '\r') { hasCr = true; return true; } return false; } /** * {@inheritDoc} * * Overridden to find the end of a line-end. */ public boolean endChar(char c) { if (c == '\n') { hasLf = true; } else { hasPeeked = true; } return true; } } /** * Class for dividing a String into any number of parts. Each part will be a * substring of the original String. The first part will at least contain the * first character in the string. All following parts will at least contain * the first character in the String not covered by any previous part. * * <p>The delim parameter to the constructors is a comma separated list of * tokens that should be recognized by the tokenizer. These tokens will be * returned by the tokenizer as tokens, and any arbitrary text between them * will also be returned as tokens. Since the comma has special meaning in * this string, it can be escaped with \ to only mean itself (like in "\\,"). * For technical reasons it is not possible for any token in this list to be * more than 32 characters long. * * <p>In addition to the delim parameter it is also possible to use custom * separators that allow any string that can be generated by the limited * version of a Turing machine that your computer is, to be used as a * delimiter. * * <p>There are some custom separators provided that you can use to get * things like strings in one token. These cannot be used simultaneously by * several tokenizers, ie they are not thread safe. * * <p>The tokenizer works in a kind of greedy way. When the first separator * token from delim is matched or any CustomSeparator returns true from * addChar, then it is satisfied it has found a token and does NOT check if * it could have found a longer token. Eg: if you have this delim string * "<,<<", then "<<" will never be found. * * <p><b>Example</b><br><pre> * MyTokenizer tzer = new MyTokenizer("Hello, how are you?", " ,\\,"); * while (tzer.hasMoreTokens()) * _cat.info("\"" + tzer.nextToken() + "\""); * </pre> * * <p>Which whould yield the following output:<pre> * "Hello" * "," * " " * "how" * " " * "are" * " " * "you?" * </pre> * * @author Michael Stockman * @since 0.11.2 * @see CustomSeparator */ public class MyTokenizer implements Enumeration { /** A custom separator for quoted strings enclosed in single quotes * and using \ as escape character. There may not be an end quote * if the tokenizer reaches the end of the String. */ public static final CustomSeparator SINGLE_QUOTED_SEPARATOR = new QuotedStringSeparator('\'', '\\'); /** A custom separator for quoted strings enclosed in double quotes * and using \ as escape character. There may not be an end quote * if the tokenizer reaches the end of the String. */ public static final CustomSeparator DOUBLE_QUOTED_SEPARATOR = new QuotedStringSeparator('\"', '\\'); /** A custom separator for expressions enclosed in parentheses and * matching lparams with rparams. There may not be proper matching * if the tokenizer reaches the end of the String. Do not use this * together with PAREN_EXPR_STRING_SEPARATOR. */ public static final CustomSeparator PAREN_EXPR_SEPARATOR = new QuotedStringSeparator('(', ')', '\0'); /** A custom separator for expressions enclosed in parentheses and * matching lparams with rparams. There may not be proper matching * if the tokenizer reaches the end of the String. It also takes * quoted strings (either single or double quotes) in the expression * into consideration, unlike PAREN_EXPR_SEPARATOR. Do not use this * together with PAREN_EXPR_SEPARATOR. */ public static final CustomSeparator PAREN_EXPR_STRING_SEPARATOR = new ExprSeparatorWithStrings(); /** A custom separator for texts. Singles out the line ends, * and consequently the lines, if they are in either dos, mac * or unix format. */ public static final CustomSeparator LINE_SEPARATOR = new LineSeparator(); private int sIdx; private final int eIdx; private int tokIdx; private final String source; private final TokenSep delims; private String savedToken; private int savedIdx; private List customSeps; private String putToken; /** * Constructs a new instance. See above for a description of the * delimiter string. * * @param string The String to be tokenized. * @param delim The String of delimiters. */ public MyTokenizer(String string, String delim) { source = string; delims = parseDelimString(delim); sIdx = 0; tokIdx = 0; eIdx = string.length(); savedToken = null; customSeps = null; putToken = null; } /** * Constructs a new instance. See above for a description of the * delimiter string and custom separators. * * @param string The String to be tokenized. * @param delim The String of delimiters. * @param sep A custom separator to use. */ public MyTokenizer(String string, String delim, CustomSeparator sep) { source = string; delims = parseDelimString(delim); sIdx = 0; tokIdx = 0; eIdx = string.length(); savedToken = null; customSeps = new ArrayList(); customSeps.add(sep); } /** * Constructs a new instance. See above for a description of the * delimiter string and custom separators. * * @param string The String to be tokenized. * @param delim The String of delimiters. * @param seps Some container with custom separators to use. */ public MyTokenizer(String string, String delim, Collection seps) { source = string; delims = parseDelimString(delim); sIdx = 0; tokIdx = 0; eIdx = string.length(); savedToken = null; customSeps = new ArrayList(seps); } /** * Returns true if there are more tokens left. * * @return true if another token can be fetched with nextToken. */ public boolean hasMoreTokens() { return sIdx < eIdx || savedToken != null || putToken != null; } /** * Retrives the next token. * * @return The next token. */ public String nextToken() { CustomSeparator csep; TokenSep sep; String s = null; int i, j; if (putToken != null) { s = putToken; putToken = null; return s; } if (savedToken != null) { s = savedToken; tokIdx = savedIdx; savedToken = null; return s; } if (sIdx >= eIdx) throw new NoSuchElementException( "No more tokens available"); for (sep = delims; sep != null; sep = sep.getNext()) sep.reset(); if (customSeps != null) { for (i = 0; i < customSeps.size(); i++) ((CustomSeparator) customSeps.get(i)).reset(); } for (i = sIdx; i < eIdx; i++) { char c = source.charAt(i); for (j = 0; customSeps != null && j < customSeps.size(); j++) { csep = (CustomSeparator) customSeps.get(j); if (csep.addChar(c)) break; } if (customSeps != null && j < customSeps.size()) { csep = (CustomSeparator) customSeps.get(j); while (csep.hasFreePart() && i + 1 < eIdx) if (csep.endChar(source.charAt(++i))) break; i -= Math.min(csep.getPeekCount(), i); int clen = Math.min(i + 1, source.length()); if (i - sIdx + 1 > csep.tokenLength()) { s = source.substring(sIdx, i - csep.tokenLength() + 1); savedIdx = i - csep.tokenLength() + 1; savedToken = source.substring( savedIdx, clen); } else { s = source.substring(sIdx, clen); } tokIdx = sIdx; sIdx = i + 1; break; } for (sep = delims; sep != null; sep = sep.getNext()) if (sep.addChar(c)) break; if (sep != null) { if (i - sIdx + 1 > sep.length()) { s = source.substring(sIdx, i - sep.length() + 1); savedIdx = i - sep.length() + 1; savedToken = sep.getString(); } else { s = sep.getString(); } tokIdx = sIdx; sIdx = i + 1; break; } } if (s == null) { s = source.substring(sIdx); tokIdx = sIdx; sIdx = eIdx; } return s; } /** * This class implements the Enumeration interface. This call maps * to nextToken. * * @return nextToken(); * @see #nextToken() nextToken */ public Object nextElement() { return nextToken(); } /** * This class implements the Enumeration interface. This call maps * to hasMoreTokens. * * @return hasMoreTokens(); * @see #hasMoreTokens() hasMoreTokens */ public boolean hasMoreElements() { return hasMoreTokens(); } /** * Returns the index in the string of the last token returned by * nextToken, or zero if no token has been retrived. * * @return The index of the last token. */ public int getTokenIndex() { return tokIdx; } /** * Put a token on the input stream. This will be the next token read * from the tokenizer. If this function is called again before the * last token has been read, then it will be lost. * * <p>The index returned from getTokenIndex will be the same for the * token put as that of the last token that wasn't put. * * @param s The token to put. * @throws NullPointerException if s is null. */ public void putToken(String s) { if (s == null) throw new NullPointerException( "Cannot put a null token"); putToken = s; } /** * Creates a linked list of TokenSeps from the comma separated string * str. * * @param str The string specifying delimiter strings. * @return A list of TokenSeps. */ private static TokenSep parseDelimString(String str) { TokenSep first = null; TokenSep p = null; int idx0, idx1, length; StringBuilder val = new StringBuilder(); char c; length = str.length(); for (idx0 = 0; idx0 < length;) { for (idx1 = idx0; idx1 < length; idx1++) { c = str.charAt(idx1); if (c == '\\') { idx1++; if (idx1 < length) val.append(str.charAt(idx1)); } else if (c == ',') { break; } else { val.append(c); } } idx1 = Math.min(idx1, length); if (idx1 > idx0) { p = new TokenSep(val.toString()); val = new StringBuilder(); p.setNext(first); first = p; } idx0 = idx1 + 1; } return first; } }