/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Imported from Apache Harmony CG 20080703, based on revision 627408. */ package java.io; /** * StreamTokenizer takes a stream and a set of tokens and parses them one at a * time. The different types of tokens that can be found are numbers, * identifiers, quoted strings, and different comment styles. */ public class StreamTokenizer { /** * Contains a number if the current token is a number (<code>ttype</code> * is <code>TT_NUMBER</code>) */ public double nval; /** * Contains a string if the current token is a word (<code>ttype</code> * is <code>TT_WORD</code>) */ public String sval; /** * After calling <code>nextToken</code>, the field <code>ttype</code> * contains the type of token that has been read. When a single character is * read, it's integer value is used. For a quoted string, the value is the * quoted character. If not one of those, then it is one of the following: * <UL> * <LI> <code>TT_WORD</code> - the token is a word.</LI> * <LI> <code>TT_NUMBER</code> - the token is a number.</LI> * <LI> <code>TT_EOL</code> - the end of line has been reached. Depends on * whether <code>eolIsSignificant</code> is <code>true</code>.</LI> * <LI> <code>TT_EOF</code> - the end of the stream has been reached.</LI> * </UL> */ /** * The constant representing end of stream. */ public static final int TT_EOF = -1; /** * The constant representing end of line. */ public static final int TT_EOL = '\n'; /** * The constant representing a number token. */ public static final int TT_NUMBER = -2; /** * The constant representing a word token. */ public static final int TT_WORD = -3; /** * Internal representation of unknown state. */ private static final int TT_UNKNOWN = -4; /** * The token type */ public int ttype = TT_UNKNOWN; /** * Internal character meanings, 0 implies TOKEN_ORDINARY */ private byte tokenTypes[] = new byte[256]; private static final byte TOKEN_COMMENT = 1; private static final byte TOKEN_QUOTE = 2; private static final byte TOKEN_WHITE = 4; private static final byte TOKEN_WORD = 8; private static final byte TOKEN_DIGIT = 16; private int lineNumber = 1; private boolean forceLowercase; private boolean isEOLSignificant; private boolean slashStarComments; private boolean slashSlashComments; private boolean pushBackToken; private boolean lastCr; /* One of these will have the stream */ private InputStream inStream; private Reader inReader; private int peekChar = -2; /** * Private constructor to initialize the default values according to the * specification. */ private StreamTokenizer() { /* * Initialize the default state per specification. All byte values 'A' * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are * considered to be alphabetic. */ wordChars('A', 'Z'); wordChars('a', 'z'); wordChars(160, 255); /** * All byte values '\u0000' through '\u0020' are considered to be white * space. */ whitespaceChars(0, 32); /** * '/' is a comment character. Single quote '\'' and double quote '"' * are string quote characters. */ commentChar('/'); quoteChar('"'); quoteChar('\''); /** * Numbers are parsed. */ parseNumbers(); /** * Ends of lines are treated as white space, not as separate tokens. * C-style and C++-style comments are not recognized. These are the * defaults and are not needed in constructor. */ } /** * Construct a new StreamTokenizer on the InputStream is. This usage of this * method should be replaced with the constructor which takes a Reader. * * @param is * The InputStream to parse tokens on. * * @deprecated Use StreamTokenizer(Reader) */ public StreamTokenizer(InputStream is) { this(); if (is == null) { throw new NullPointerException(); } inStream = is; } /** * Construct a new StreamTokenizer on the Reader <code>r</code>. * Initialize the default state per specification. * <UL> * <LI>All byte values 'A' through 'Z', 'a' through 'z', and '\u00A0' * through '\u00FF' are considered to be alphabetic.</LI> * <LI>All byte values '\u0000' through '\u0020' are considered to * be white space. '/' is a comment character.</LI> * <LI>Single quote '\'' and double quote '"' are string quote characters.</LI> * <LI>Numbers are parsed.</LI> * <LI>Ends of lines are considered to be white space rather than separate * tokens.</LI> * <LI>C-style and C++-style comments are not recognized.</LI> * </UL> * These are the defaults and are not needed in constructor. * * @param r * The InputStream to parse tokens on. */ public StreamTokenizer(Reader r) { this(); if (r == null) { throw new NullPointerException(); } inReader = r; } /** * Set the character <code>ch</code> to be regarded as a comment * character. * * @param ch * The character to be considered a comment character. */ public void commentChar(int ch) { if (0 <= ch && ch < tokenTypes.length) { tokenTypes[ch] = TOKEN_COMMENT; } } /** * Set a boolean indicating whether or not end of line is significant and * should be returned as <code>TT_EOF</code> in <code>ttype</code>. * * @param flag * <code>true</code> if EOL is significant, <code>false</code> * otherwise. */ public void eolIsSignificant(boolean flag) { isEOLSignificant = flag; } /** * Answer the current line number. * * @return the current line number. */ public int lineno() { return lineNumber; } /** * Set a boolean indicating whether or not tokens should be uppercased when * present in <code>sval</code>. * * @param flag * <code>true</code> if <code>sval</code> should be forced * uppercase, <code>false</code> otherwise. */ public void lowerCaseMode(boolean flag) { forceLowercase = flag; } /** * Answer the next token type. * * @return The next token to be parsed. * * @throws IOException * If an IO error occurs while getting the token */ public int nextToken() throws IOException { if (pushBackToken) { pushBackToken = false; if (ttype != TT_UNKNOWN) { return ttype; } } sval = null; // Always reset sval to null int currentChar = peekChar == -2 ? read() : peekChar; if (lastCr && currentChar == '\n') { lastCr = false; currentChar = read(); } if (currentChar == -1) { return (ttype = TT_EOF); } byte currentType = currentChar > 255 ? TOKEN_WORD : tokenTypes[currentChar]; while ((currentType & TOKEN_WHITE) != 0) { /** * Skip over white space until we hit a new line or a real token */ if (currentChar == '\r') { lineNumber++; if (isEOLSignificant) { lastCr = true; peekChar = -2; return (ttype = TT_EOL); } if ((currentChar = read()) == '\n') { currentChar = read(); } } else if (currentChar == '\n') { lineNumber++; if (isEOLSignificant) { peekChar = -2; return (ttype = TT_EOL); } currentChar = read(); } else { // Advance over this white space character and try again. currentChar = read(); } if (currentChar == -1) { return (ttype = TT_EOF); } currentType = currentChar > 255 ? TOKEN_WORD : tokenTypes[currentChar]; } /** * Check for digits before checking for words since digits can be * contained within words. */ if ((currentType & TOKEN_DIGIT) != 0) { StringBuffer digits = new StringBuffer(20); boolean haveDecimal = false, checkJustNegative = currentChar == '-'; while (true) { if (currentChar == '.') { haveDecimal = true; } digits.append((char) currentChar); currentChar = read(); if ((currentChar < '0' || currentChar > '9') && (haveDecimal || currentChar != '.')) { break; } } peekChar = currentChar; if (checkJustNegative && digits.length() == 1) { // Didn't get any other digits other than '-' return (ttype = '-'); } try { nval = Double.valueOf(digits.toString()).doubleValue(); } catch (NumberFormatException e) { // Unsure what to do, will write test. nval = 0; } return (ttype = TT_NUMBER); } // Check for words if ((currentType & TOKEN_WORD) != 0) { StringBuffer word = new StringBuffer(20); while (true) { word.append((char) currentChar); currentChar = read(); if (currentChar == -1 || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) { break; } } peekChar = currentChar; sval = forceLowercase ? word.toString().toLowerCase() : word .toString(); return (ttype = TT_WORD); } // Check for quoted character if (currentType == TOKEN_QUOTE) { int matchQuote = currentChar; StringBuffer quoteString = new StringBuffer(); int peekOne = read(); while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r' && peekOne != '\n') { boolean readPeek = true; if (peekOne == '\\') { int c1 = read(); // Check for quoted octal IE: \377 if (c1 <= '7' && c1 >= '0') { int digitValue = c1 - '0'; c1 = read(); if (c1 > '7' || c1 < '0') { readPeek = false; } else { digitValue = digitValue * 8 + (c1 - '0'); c1 = read(); // limit the digit value to a byte if (digitValue > 037 || c1 > '7' || c1 < '0') { readPeek = false; } else { digitValue = digitValue * 8 + (c1 - '0'); } } if (!readPeek) { // We've consumed one to many quoteString.append((char) digitValue); peekOne = c1; } else { peekOne = digitValue; } } else { switch (c1) { case 'a': peekOne = 0x7; break; case 'b': peekOne = 0x8; break; case 'f': peekOne = 0xc; break; case 'n': peekOne = 0xA; break; case 'r': peekOne = 0xD; break; case 't': peekOne = 0x9; break; case 'v': peekOne = 0xB; break; default: peekOne = c1; } } } if (readPeek) { quoteString.append((char) peekOne); peekOne = read(); } } if (peekOne == matchQuote) { peekOne = read(); } peekChar = peekOne; ttype = matchQuote; sval = quoteString.toString(); return ttype; } // Do comments, both "//" and "/*stuff*/" if (currentChar == '/' && (slashSlashComments || slashStarComments)) { if ((currentChar = read()) == '*' && slashStarComments) { int peekOne = read(); while (true) { currentChar = peekOne; peekOne = read(); if (currentChar == -1) { peekChar = -1; return (ttype = TT_EOF); } if (currentChar == '\r') { if (peekOne == '\n') { peekOne = read(); } lineNumber++; } else if (currentChar == '\n') { lineNumber++; } else if (currentChar == '*' && peekOne == '/') { peekChar = read(); return nextToken(); } } } else if (currentChar == '/' && slashSlashComments) { // Skip to EOF or new line then return the next token while ((currentChar = read()) >= 0 && currentChar != '\r' && currentChar != '\n') { // Intentionally empty } peekChar = currentChar; return nextToken(); } else if (currentType != TOKEN_COMMENT) { // Was just a slash by itself peekChar = currentChar; return (ttype = '/'); } } // Check for comment character if (currentType == TOKEN_COMMENT) { // Skip to EOF or new line then return the next token while ((currentChar = read()) >= 0 && currentChar != '\r' && currentChar != '\n') { // Intentionally empty } peekChar = currentChar; return nextToken(); } peekChar = read(); return (ttype = currentChar); } /** * Set the character <code>ch</code> to be regarded as an ordinary * character. * * @param ch * The character to be considered an ordinary comment character. */ public void ordinaryChar(int ch) { if (0 <= ch && ch < tokenTypes.length) { tokenTypes[ch] = 0; } } /** * Set the characters ranging from <code>low</code> to <code>hi</code> * to be regarded as ordinary characters. * * @param low * The starting range for ordinary characters. * @param hi * The ending range for ordinary characters. */ public void ordinaryChars(int low, int hi) { if (low < 0) { low = 0; } if (hi > tokenTypes.length) { hi = tokenTypes.length - 1; } for (int i = low; i <= hi; i++) { tokenTypes[i] = 0; } } /** * Indicate that numbers should be parsed. */ public void parseNumbers() { for (int i = '0'; i <= '9'; i++) { tokenTypes[i] |= TOKEN_DIGIT; } tokenTypes['.'] |= TOKEN_DIGIT; tokenTypes['-'] |= TOKEN_DIGIT; } /** * Indicate that the current token should be pushed back and returned the * next time <code>nextToken()</code> is called. */ public void pushBack() { pushBackToken = true; } /** * Set the character <code>ch</code> to be regarded as a quote character. * * @param ch * The character to be considered a quote comment character. */ public void quoteChar(int ch) { if (0 <= ch && ch < tokenTypes.length) { tokenTypes[ch] = TOKEN_QUOTE; } } private int read() throws IOException { // Call the read for the appropriate stream if (inStream == null) { return inReader.read(); } return inStream.read(); } /** * Reset all characters so that they are ordinary. */ public void resetSyntax() { for (int i = 0; i < 256; i++) { tokenTypes[i] = 0; } } /** * Set a boolean indicating whether or not slash slash comments should be * recognized. The comment ends at a new line. * * @param flag * <code>true</code> if <code>//</code> should be recognized * as the start of a comment, <code>false</code> otherwise. */ public void slashSlashComments(boolean flag) { slashSlashComments = flag; } /** * Set a boolean indicating whether or not slash star comments should be * recognized. Slash-star comments cannot be nested and end when a * star-slash combination is found. * * @param flag * <code>true</code> if <code>/*</code> should be recognized * as the start of a comment, <code>false</code> otherwise. */ public void slashStarComments(boolean flag) { slashStarComments = flag; } /** * Answer the state of this tokenizer in a readable format. * * @return The current state of this tokenizer. */ public String toString() { // Values determined through experimentation StringBuffer result = new StringBuffer(); result.append("Token["); //$NON-NLS-1$ switch (ttype) { case TT_EOF: result.append("EOF"); //$NON-NLS-1$ break; case TT_EOL: result.append("EOL"); //$NON-NLS-1$ break; case TT_NUMBER: result.append("n="); //$NON-NLS-1$ result.append(nval); break; case TT_WORD: result.append(sval); break; default: if (ttype == TT_UNKNOWN || tokenTypes[ttype] == TOKEN_QUOTE) { result.append(sval); } else { result.append('\''); result.append((char) ttype); result.append('\''); } } result.append("], line "); //$NON-NLS-1$ result.append(lineNumber); return result.toString(); } /** * Set the characters ranging from <code>low</code> to <code>hi</code> * to be regarded as whitespace characters. * * @param low * The starting range for whitespace characters. * @param hi * The ending range for whitespace characters. */ public void whitespaceChars(int low, int hi) { if (low < 0) { low = 0; } if (hi > tokenTypes.length) { hi = tokenTypes.length - 1; } for (int i = low; i <= hi; i++) { tokenTypes[i] = TOKEN_WHITE; } } /** * Set the characters ranging from <code>low</code> to <code>hi</code> * to be regarded as word characters. * * @param low * The starting range for word characters. * @param hi * The ending range for word characters. */ public void wordChars(int low, int hi) { if (low < 0) { low = 0; } if (hi > tokenTypes.length) { hi = tokenTypes.length - 1; } for (int i = low; i <= hi; i++) { tokenTypes[i] |= TOKEN_WORD; } } }