/*********************************************************************** * @(#)$RCSfile: Tokenizer.java,v $ $Revision: 1.6 $$Date: 2006/04/21 14:14:56 $ * * Copyright (c) Christof Dallermassl * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License (LGPL) * as published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ***********************************************************************/ package org.dinopolis.util.io; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PushbackReader; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; //---------------------------------------------------------------------- /** * This tokenizer merges the benefits of the java.lang.StringTokenizer * class and the java.io.StreamTokenizer class. It provides a low * level and a high level interface to the tokenizer. The low level * interface consists of the method pair nextToken() and getWord(), * where the first returns the type of token in the parsing process, * and the latter returns the String element itself. * <p> * The high level interface consists of the methods hasNextLine() and * nextLine(). They use the low level interface to parse the data line * by line and create a list of strings from it. 
 * <p>
 * It is unclear whether it is wise to mix the usage of the high and
 * the low level interface. For normal usage, the high level interface
 * should be more comfortable to use and does not have any
 * drawbacks.
 * <p>
 * An example for the high level interface:
 * <pre>
 *  try
 *  {
 *    // simple example, tokenizing string, no escape, but quoted
 *    // works:
 *    System.out.println("example 1");
 *    Tokenizer tokenizer = new Tokenizer("text,,,\"another,text\"");
 *    List tokens;
 *    while(tokenizer.hasNextLine())
 *    {
 *      tokens = tokenizer.nextLine();
 *      System.out.println(tokens.get(0)); // prints 'text'
 *      System.out.println(tokens.get(1)); // prints ''
 *      System.out.println(tokens.get(2)); // prints ''
 *      System.out.println(tokens.get(3)); // prints 'another,text'
 *    }
 *
 *    System.out.println("example 2");
 *    // simple example, tokenizing string, using escape char and
 *    // quoted strings:
 *    tokenizer = new Tokenizer("text,text with\\,comma,,\"another,text\"");
 *    tokenizer.respectEscapedCharacters(true);
 *    while(tokenizer.hasNextLine())
 *    {
 *      tokens = tokenizer.nextLine();
 *      System.out.println(tokens.get(0)); // prints 'text'
 *      System.out.println(tokens.get(1)); // prints 'text with,comma'
 *      System.out.println(tokens.get(2)); // prints ''
 *      System.out.println(tokens.get(3)); // prints 'another,text'
 *    }
 *  }
 *  catch(Exception ioe)
 *  {
 *    ioe.printStackTrace();
 *  }
 * </pre>
 * <p>
 * The advantages compared to the StreamTokenizer class are: Unlike
 * the StreamTokenizer, this Tokenizer class returns the delimiters as
 * tokens and therefore may be used to tokenize e.g. comma separated
 * files with empty fields (the StreamTokenizer handles multiple
 * delimiters in a row like one delimiter).
 * <p>
 * The tokenizer respects quoted words, so the delimiter is ignored if
 * inside quotes. And it may handle escaped characters (like an
 * escaped quote character, or an escaped new line).
 * So the line
 * <code>eric,"he said, \"great!\""</code> returns <code>eric</code>
 * and <code>he said, "great!"</code> as words.
 * <p>
 * Low level interface: The design of the Tokenizer allows getting
 * empty columns as well as treating multiple delimiters in a row as one
 * delimiter. For the first approach trigger the values on every
 * DELIMITER and EOF token whereas for the second, trigger only on
 * WORD tokens.
 * <p>
 * If one wants to be informed about empty words as well, use the
 * Tokenizer like in the following code fragment:
 * <pre>
 *  Tokenizer tokenizer = new Tokenizer("text,,,another text");
 *  String word = "";
 *  int token;
 *  while((token = tokenizer.nextToken()) != Tokenizer.EOF)
 *  {
 *    switch(token)
 *    {
 *    case Tokenizer.EOL:
 *      System.out.println("word: "+word);
 *      word = "";
 *      System.out.println("-------------");
 *      break;
 *    case Tokenizer.WORD:
 *      word = tokenizer.getWord();
 *      break;
 *    case Tokenizer.QUOTED_WORD:
 *      word = tokenizer.getWord() + " (quoted)";
 *      break;
 *    case Tokenizer.DELIMITER:
 *      System.out.println("word: "+word);
 *      word = "";
 *      break;
 *    default:
 *      System.err.println("Unknown Token: "+token);
 *    }
 *  }
 * </pre>
 * In this example, if the delimiter is set to a comma, a line like
 * <code>column1,,,"column4,partofcolumn4"</code> would be treated correctly.
 * <p>
 * This tokenizer uses the LF character as the end of line character. It
 * ignores any CR characters, so it can be used in windows
 * environments as well.
/**
 * Tokenizer for delimiter separated text (e.g. comma separated values).
 * <p>
 * Low level interface: {@link #nextToken()} reports the type of the next
 * token ({@link #WORD}, {@link #QUOTED_WORD}, {@link #DELIMITER},
 * {@link #EOL}, {@link #EOF} or {@link #ERROR}) and {@link #getWord()}
 * returns the text collected for it. Delimiters are reported as tokens of
 * their own, so empty fields can be detected.
 * <p>
 * High level interface: {@link #hasNextLine()} and {@link #nextLine()}
 * use the low level interface to return one list of strings per line.
 * <p>
 * Defaults: the comma is the only delimiter, the double quote is the
 * quote character and quoted words are respected, the backslash is the
 * escape character but escaping is switched off, and end of line is
 * significant. LF is the end of line character; CR characters are
 * dropped while reading.
 *
 * @author Christof Dallermassl
 * @version $Revision: 1.6 $
 */
public class Tokenizer {
  /** Token: the end of the input was reached. */
  public static final int EOF = -1;
  /** Token: an end of line was read. */
  public static final int EOL = 0;
  /** Token: an unquoted word was read, see {@link #getWord()}. */
  public static final int WORD = 1;
  /** Token: a quoted word was read, see {@link #getWord()}. */
  public static final int QUOTED_WORD = 2;
  /** Token: a delimiter was read. */
  public static final int DELIMITER = 3;
  /** Token: a quoted word was not closed before the end of the line or of the input. */
  public static final int ERROR = 4;
  /** Value of {@link #getLastToken()} before {@link #nextToken()} was called. */
  public static final int NOT_STARTED = 5;

  /** the reader to read from (pushback capacity of two characters) */
  protected PushbackReader reader_;
  /** the buffer collecting the characters of the current token */
  protected StringBuffer buffer_;
  /** all characters in this string are used as delimiters */
  protected String delimiters_ = ",";
  /** the escape character */
  protected int escapeChar_ = '\\';
  /** the quote character */
  protected int quoteChar_ = '"';
  /**
   * <code>true</code> while the character most recently returned by
   * {@link #readNextChar()} was preceded by an escape character
   */
  protected boolean escapeMode_ = false;
  /** if true, end of line is reported as EOL, otherwise as DELIMITER */
  protected boolean eolIsSignificant_ = true;
  /** if true, escape characters are respected */
  protected boolean respectEscapedChars_ = false;
  /** if true, quoted words are respected */
  protected boolean respectQuotedWords_ = true;
  /** line count, starts at 1 */
  protected int lineCount_ = 1;
  /** set as soon as the end of the input was seen */
  protected boolean eofReached_ = false;
  /** the last token returned by {@link #nextToken()} */
  protected int lastToken_ = NOT_STARTED;

  /**
   * Creates a tokenizer that reads from the given string. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param string the string to read from.
   */
  public Tokenizer(String string) {
    this(new StringReader(string));
  }

  /**
   * Creates a tokenizer that reads from the given string. All characters
   * in the given delimiters string are used as delimiter. The tokenizer
   * does not respect escape characters but respects quoted words.
   *
   * @param string the string to read from.
   * @param delimiters the delimiters to use.
   */
  public Tokenizer(String string, String delimiters) {
    this(new StringReader(string));
    setDelimiters(delimiters);
  }

  /**
   * Creates a tokenizer that reads from the given stream. The bytes are
   * decoded by an {@link InputStreamReader} created without an explicit
   * charset. It uses the comma as delimiter, does not respect escape
   * characters but respects quoted words.
   *
   * @param inStream the stream to read from.
   */
  public Tokenizer(InputStream inStream) {
    this(new InputStreamReader(inStream));
  }

  /**
   * Creates a tokenizer that reads from the given reader. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param reader the reader to read from.
   */
  public Tokenizer(Reader reader) {
    reader_ = new PushbackReader(reader, 2);
    buffer_ = new StringBuffer();
  }

  /**
   * Sets a single delimiter character, replacing all delimiters set
   * before. The default is the comma.
   *
   * @param delimiterChar the delimiter character.
   */
  public void setDelimiter(int delimiterChar) {
    delimiters_ = String.valueOf((char) delimiterChar);
  }

  /**
   * Returns the first delimiter character.
   *
   * @return the first delimiter character.
   * @throws StringIndexOutOfBoundsException if the delimiter string is empty.
   * @deprecated use the {@link #getDelimiters()} method now
   */
  @Deprecated
  public int getDelimiter() {
    return delimiters_.charAt(0);
  }

  /**
   * Sets the delimiter characters. Every character of the given string
   * is used as delimiter.
   *
   * @param delimiters the delimiter characters.
   * @throws NullPointerException if <code>delimiters</code> is
   * <code>null</code> (reported here instead of on the first read).
   */
  public void setDelimiters(String delimiters) {
    if (delimiters == null) {
      throw new NullPointerException("delimiters must not be null");
    }
    delimiters_ = delimiters;
  }

  /**
   * Returns the delimiter characters.
   *
   * @return all characters that are used as delimiter.
   */
  public String getDelimiters() {
    return delimiters_;
  }

  /**
   * Sets the escape character. The default is the backslash.
   *
   * @param escapeChar the escape character.
   */
  public void setEscapeChar(int escapeChar) {
    escapeChar_ = escapeChar;
  }

  /**
   * Returns the escape character.
   *
   * @return the escape character.
   */
  public int getEscapeChar() {
    return escapeChar_;
  }

  /**
   * Switches the handling of escape characters on or off. The default
   * is to ignore escape characters.
   *
   * @param respectEscaped <code>true</code> if escape characters should
   * be respected.
   */
  public void respectEscapedCharacters(boolean respectEscaped) {
    respectEscapedChars_ = respectEscaped;
  }

  /**
   * Returns <code>true</code>, if the escape character is respected.
   *
   * @return <code>true</code>, if the escape character is respected.
   */
  public boolean respectEscapedCharacters() {
    return respectEscapedChars_;
  }

  /**
   * Returns the quote character.
   *
   * @return the quote character.
   */
  public int getQuoteChar() {
    return quoteChar_;
  }

  /**
   * Sets the quote character. The default is the double quote.
   *
   * @param quoteChar the quote character.
   */
  public void setQuoteChar(int quoteChar) {
    quoteChar_ = quoteChar;
  }

  /**
   * Switches the handling of quoted words on or off. The default is to
   * respect quoted words.
   *
   * @param respectQuotes <code>true</code> if quoted words should be
   * respected.
   */
  public void respectQuotedWords(boolean respectQuotes) {
    respectQuotedWords_ = respectQuotes;
  }

  /**
   * Returns <code>true</code>, if quoted words are respected.
   *
   * @return <code>true</code>, if quoted words are respected.
   */
  public boolean respectQuotedWords() {
    return respectQuotedWords_;
  }

  /**
   * If set to <code>true</code> the end of line is signaled by the EOL
   * token. If set to <code>false</code> the end of line is reported as
   * a DELIMITER token. The default value is <code>true</code>.
   *
   * @param significant whether the end of line is reported as EOL.
   */
  public void eolIsSignificant(boolean significant) {
    eolIsSignificant_ = significant;
  }

  /**
   * Returns whether an end of line is reported as EOL token
   * (<code>true</code>) or as DELIMITER token (<code>false</code>).
   *
   * @return <code>true</code>, if an end of line is reported as EOL.
   */
  public boolean isEolSignificant() {
    return eolIsSignificant_;
  }

  /**
   * Returns the current line number of the reader. Counting starts at 1.
   *
   * @return the current line number of the reader.
   */
  public int getLineNumber() {
    return lineCount_;
  }

  /**
   * Returns the text collected for the token returned by the last call
   * of {@link #nextToken()}. The buffer is cleared at the start of every
   * nextToken() call, so it only holds text for WORD and QUOTED_WORD
   * tokens (and the unfinished text of an ERROR token).
   *
   * @return the value of the token.
   */
  public String getWord() {
    return buffer_.toString();
  }

  /**
   * Returns the last token that was returned from the nextToken() method,
   * or {@link #NOT_STARTED} if nextToken() was not called yet.
   *
   * @return the last token.
   */
  public int getLastToken() {
    return lastToken_;
  }

  /**
   * Returns true, if the given character is seen as a delimiter. An
   * escaped character is never a delimiter.
   *
   * @param character the character to check for delimiter
   * @return true, if the given character is seen as a delimiter.
   */
  protected boolean isDelimiter(int character) {
    if (escapeMode_) {
      return false;
    }
    return delimiters_.indexOf(character) >= 0;
  }

  /**
   * Returns true, if the given character is seen as a quote character.
   * This is never the case if quoted words are not respected or if the
   * character is escaped.
   *
   * @param character the character to check for quote.
   * @return true, if the given character is seen as a quote character.
   */
  protected boolean isQuoteChar(int character) {
    if (!respectQuotedWords_ || escapeMode_) {
      return false;
    }
    return character == quoteChar_;
  }

  /**
   * Returns true, if the given character is seen as an escape character.
   * This is never the case if escape characters are not respected or if
   * the character itself is escaped.
   *
   * @param character the character to check for escape character.
   * @return true, if the given character is seen as an escape character.
   */
  protected boolean isEscapeChar(int character) {
    if (!respectEscapedChars_ || escapeMode_) {
      return false;
    }
    return character == escapeChar_;
  }

  /**
   * Returns true, if the given character ends a line (LF, CR or the end
   * of input marker -1). An escaped character never ends a line.
   * <p>
   * This method has side effects: an escaped LF increments the line
   * count (it becomes part of a word, but the reader still moved to the
   * next line), and an unescaped -1 sets the end of file marker.
   *
   * @param character the character to check for end of line.
   * @return true, if the given character is seen as an end of line
   * character.
   */
  protected boolean isEndOfLine(int character) {
    if (escapeMode_) {
      if (character == '\n') {
        lineCount_++; // count the line even if the newline is escaped
      }
      return false;
    }
    if (character == -1) {
      eofReached_ = true;
    }
    return character == '\n' || character == '\r' || character == -1;
  }

  /**
   * Closes the tokenizer (and the reader it uses internally).
   *
   * @exception IOException if an error occurred.
   */
  public void close() throws IOException {
    reader_.close();
  }

  /**
   * Reads the next character that is not a CR from the reader, without
   * any escape handling.
   *
   * @return the next character or -1 at the end of the input.
   * @exception IOException if an error occurred.
   */
  private int readIgnoringCarriageReturns() throws IOException {
    int character;
    do {
      character = reader_.read();
    } while (character == '\r');
    return character;
  }

  /**
   * Reads and returns the next character from the reader, resolving
   * escape sequences. CR characters are always skipped.
   * <p>
   * If escape characters are respected and an escape character is read,
   * it is dropped and the following character is returned with the
   * escape flag set, so that it is not seen as delimiter, quote, escape
   * or end of line character. The flag describes only the character
   * returned by this call; it is cleared at the start of every call, so
   * several escape sequences in a row are resolved one by one.
   * <p>
   * A newline directly after the escape character is ignored (the line
   * count is still incremented); the character after that newline is
   * the one returned as escaped. An escape character at the very end of
   * the input escapes nothing: -1 is returned unescaped and the end of
   * file marker is set.
   *
   * @return the next character or -1 at the end of the input.
   * @exception IOException if an error occurred.
   */
  protected int readNextChar() throws IOException {
    escapeMode_ = false;
    int nextChar = readIgnoringCarriageReturns();
    if (!isEscapeChar(nextChar)) {
      return nextChar;
    }

    // drop the escape character itself and fetch the character it protects
    nextChar = readIgnoringCarriageReturns();
    if (nextChar == '\n') {
      // escaped newline: ignore it, but keep the line count correct
      lineCount_++;
      nextChar = readIgnoringCarriageReturns();
    }
    if (nextChar == -1) {
      // dangling escape character: the end of input cannot be escaped,
      // otherwise (char)-1 would end up in the word
      eofReached_ = true;
      return nextChar;
    }
    escapeMode_ = true;
    return nextChar;
  }

  /**
   * Returns the next token from the reader. The token's value may be
   * WORD, QUOTED_WORD, EOF, EOL, DELIMITER or ERROR. In the case of WORD
   * or QUOTED_WORD the actual word can be obtained by the use of the
   * getWord method. ERROR is returned if a quoted word is not closed
   * before the end of the line or of the input.
   * <p>
   * When the end of the input is detected for the first time without a
   * word being read, the end of line handling applies (EOL, or DELIMITER
   * if end of line is not significant); every later call returns EOF.
   *
   * @return the next token.
   * @exception IOException if an error occurred.
   */
  public int nextToken() throws IOException {
    buffer_.setLength(0);
    int nextChar = readNextChar();

    if (eofReached_) {
      return remember(EOF);
    }
    if (isEndOfLine(nextChar)) {
      lineCount_++;
      return remember(eolIsSignificant_ ? EOL : DELIMITER);
    }
    if (isDelimiter(nextChar)) {
      return remember(DELIMITER);
    }
    if (isQuoteChar(nextChar)) {
      return remember(readQuotedWord());
    }
    return remember(readWord(nextChar));
  }

  /**
   * Stores the given token as last token and returns it.
   *
   * @param token the token to store.
   * @return the given token.
   */
  private int remember(int token) {
    lastToken_ = token;
    return token;
  }

  /**
   * Collects the characters of a quoted word into the buffer; the
   * opening quote was already consumed by the caller.
   *
   * @return QUOTED_WORD if the closing quote was found, ERROR if the
   * line or the input ended first.
   * @exception IOException if an error occurred.
   */
  private int readQuotedWord() throws IOException {
    while (true) {
      int nextChar = readNextChar();
      if (isEndOfLine(nextChar)) {
        if (nextChar == '\n') {
          // the newline is consumed here, so it has to be counted here
          lineCount_++;
        }
        return ERROR;
      }
      if (isQuoteChar(nextChar)) {
        return QUOTED_WORD;
      }
      buffer_.append((char) nextChar);
    }
  }

  /**
   * Collects the characters of an unquoted word into the buffer. The
   * delimiter or end of line character that ends the word is pushed
   * back, so that the next call of nextToken() reports it.
   *
   * @param firstChar the first character of the word (already read).
   * @return always WORD.
   * @exception IOException if an error occurred.
   */
  private int readWord(int firstChar) throws IOException {
    int nextChar = firstChar;
    do {
      buffer_.append((char) nextChar);
      nextChar = readNextChar();
    } while (!isDelimiter(nextChar) && !isEndOfLine(nextChar));

    // never push back the end of input marker: unread(-1) would insert
    // the character 0xFFFF into the stream
    if (nextChar != -1) {
      reader_.unread(nextChar);
    }
    return WORD;
  }

  /**
   * Returns true, if the tokenizer can return another line. After an
   * EOL token (or before the first token) this is the case if at least
   * one character other than CR is left in the input.
   * <p>
   * The look ahead is done on the raw reader, so an escape character at
   * the beginning of a line is left in place for the next call of
   * nextToken().
   *
   * @return true, if the tokenizer can return another line.
   * @exception IOException if an error occurred.
   */
  public boolean hasNextLine() throws IOException {
    if (lastToken_ == EOF) {
      return false;
    }
    if (lastToken_ == EOL || lastToken_ == NOT_STARTED) {
      int nextChar = readIgnoringCarriageReturns();
      if (nextChar == -1) {
        return false;
      }
      reader_.unread(nextChar);
    }
    return true;
  }

  /**
   * Returns a list of elements (Strings) from the next line of the
   * tokenizer. If there are multiple delimiters without any values in
   * between, empty (zero length) strings are added to the list. They
   * may be removed by the use of the {@link
   * #removeZeroLengthElements(List)} method.
   * <p>
   * Tokens other than WORD, QUOTED_WORD, DELIMITER, EOL and EOF (i.e.
   * ERROR) are reported on <code>System.err</code> and skipped.
   *
   * @return a list of elements (Strings) from the next line of the
   * tokenizer.
   * @exception IOException if an error occurred.
   */
  public List<String> nextLine() throws IOException {
    List<String> elements = new ArrayList<>();
    String word = "";
    while (true) {
      int token = nextToken();
      switch (token) {
        case WORD:
        case QUOTED_WORD:
          word = getWord();
          break;
        case DELIMITER:
          elements.add(word);
          word = "";
          break;
        case EOL:
        case EOF:
          elements.add(word);
          return elements;
        default:
          System.err.println("Unknown Token: " + token);
      }
    }
  }

  /**
   * This helper method removes all zero length elements from the given
   * list and returns the result. The given list is not changed!
   *
   * @param list the list of String objects to remove the zero elements from.
   * @return a copy of the given list where all zero length elements are removed.
   */
  public static List<String> removeZeroLengthElements(List<String> list) {
    return removeZeroLengthElements(list, false);
  }

  /**
   * This helper method copies all elements that are not of zero length
   * to a new list, optionally trimming them first. The given list is
   * not changed.
   *
   * @param list the list of String objects to remove the zero elements from.
   * @param trim if set to <code>true</code>, all elements are trimmed
   * (see {@link String#trim()}) before their length is compared to
   * zero, so elements that only consist of spaces are removed as well
   * and the remaining elements are returned trimmed.
   * @return a new list without the zero length elements.
   */
  public static List<String> removeZeroLengthElements(List<String> list, boolean trim) {
    List<String> result = new ArrayList<>();
    for (String element : list) {
      String value = trim ? element.trim() : element;
      if (value.length() != 0) {
        result.add(value);
      }
    }
    return result;
  }
}