/*
* file: Tokenizer.java
* author: Jon Iles
* copyright: (c) Packwood Software 2002-2003
* date: 03/01/2003
*/
/*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
*/
package net.sf.mpxj.common;
import java.io.IOException;
/**
 * This class implements a tokenizer based loosely on
 * java.io.StreamTokenizer. This tokenizer is designed to parse records from
 * an MPX file correctly. In particular it will handle empty fields,
 * represented by adjacent field delimiters.
 *
 * Characters are obtained one at a time from the {@link #read()} method
 * supplied by a subclass. Fields are separated by a delimiter character
 * (a comma unless changed via {@link #setDelimiter(char)}) and may be
 * enclosed in double quotes, in which case delimiters are treated as
 * ordinary text and a pair of adjacent quotes represents one literal quote.
 */
public abstract class Tokenizer
{
   /**
    * This method must be implemented to read the next character from the
    * data source.
    *
    * @return next character, or {@link #TT_EOF} once the data source is exhausted
    * @throws IOException if the underlying data source cannot be read
    */
   protected abstract int read() throws IOException;

   /**
    * This method retrieves the next token and returns a constant representing
    * the type of token found. The text of a {@link #TT_WORD} token is
    * available from {@link #getToken()}.
    *
    * When a field is terminated by an end of line or by the end of the data,
    * the field is returned first as {@link #TT_WORD}, and the terminator is
    * returned by the following call without reading any further input.
    * An empty field immediately before an end of line or the end of the data
    * is not reported as a token. An end of line always terminates the current
    * field, even when it appears inside quoted text.
    *
    * @return token type value: {@link #TT_WORD}, {@link #TT_EOL} or {@link #TT_EOF}
    * @throws IOException if the underlying data source cannot be read
    */
   public int nextToken() throws IOException
   {
      // Deliver any terminator queued by the previous call before reading more input.
      int result = m_next;
      m_next = 0;
      m_buffer.setLength(0);

      boolean quoted = false;

      // One character of lookahead, needed to tell a closing quote from an
      // escaped (doubled) quote. An explicit flag is used rather than a
      // sentinel value so that an end-of-data marker (-1) obtained while
      // looking ahead is handed back to the main loop instead of being
      // discarded and the exhausted source read a second time.
      boolean pushedBack = false;
      int pushedBackChar = 0;

      while (result == 0)
      {
         int c;
         if (pushedBack)
         {
            c = pushedBackChar;
            pushedBack = false;
         }
         else
         {
            c = read();
         }

         switch (c)
         {
            case TT_EOF:
            {
               if (m_buffer.length() == 0)
               {
                  result = TT_EOF;
               }
               else
               {
                  // Report the pending field now, and the end of data next time.
                  result = TT_WORD;
                  m_next = TT_EOF;
               }
               break;
            }

            case TT_EOL:
            {
               // Treat "\r\n" as a single line ending by dropping the trailing '\r'.
               int length = m_buffer.length();
               if (length != 0 && m_buffer.charAt(length - 1) == '\r')
               {
                  --length;
                  m_buffer.setLength(length);
               }

               if (length == 0)
               {
                  result = TT_EOL;
               }
               else
               {
                  // Report the pending field now, and the end of line next time.
                  result = TT_WORD;
                  m_next = TT_EOL;
               }
               break;
            }

            default:
            {
               if (c != m_quote)
               {
                  if (c == m_delimiter && !quoted)
                  {
                     result = TT_WORD;
                  }
                  else
                  {
                     // Ordinary text, or a delimiter protected by quotes.
                     m_buffer.append((char) c);
                  }
               }
               else if (!quoted)
               {
                  if (startQuotedIsValid(m_buffer))
                  {
                     quoted = true;
                  }
                  else
                  {
                     // A quote which cannot open quoted text is kept literally.
                     m_buffer.append((char) c);
                  }
               }
               else
               {
                  // A quote inside quoted text: a second quote straight after it
                  // means one literal quote, anything else ends the quoted text
                  // and must be processed by the main loop as the next character.
                  int following = read();
                  if (following == m_quote)
                  {
                     m_buffer.append((char) c);
                  }
                  else
                  {
                     quoted = false;
                     pushedBackChar = following;
                     pushedBack = true;
                  }
               }
               break;
            }
         }
      }

      m_type = result;
      return result;
   }

   /**
    * This method allows us to control the behaviour of the tokenizer for
    * quoted text. Normally quoted text begins with a quote character
    * at the first position within a field. As this method is protected,
    * sub classes can alter this behaviour if required.
    *
    * @param buffer the field contents read so far
    * @return true if it is valid to treat the subsequent text as quoted
    */
   protected boolean startQuotedIsValid(StringBuilder buffer)
   {
      return buffer.length() == 0;
   }

   /**
    * This method retrieves the text of the last token found. The text is
    * empty when the last token was an end of line or end of data marker.
    *
    * @return last token text
    */
   public String getToken()
   {
      return m_buffer.toString();
   }

   /**
    * This method retrieves the type of the last token found.
    *
    * @return last token type, as most recently returned by {@link #nextToken()}
    */
   public int getType()
   {
      return m_type;
   }

   /**
    * This method is used to set the delimiter character recognised
    * by the tokenizer. The delimiter is a comma until this method is called.
    *
    * @param delimiter delimiter character
    */
   public void setDelimiter(char delimiter)
   {
      m_delimiter = delimiter;
   }

   /** Token type: end of line. Also the character value which marks a line ending. */
   public static final int TT_EOL = '\n';

   /** Token type: end of data. Also the value returned by {@link #read()} at end of data. */
   public static final int TT_EOF = -1;

   /** Token type: a field, whose text is available from {@link #getToken()}. */
   public static final int TT_WORD = -3;

   /** Character which opens and closes quoted text. */
   private final char m_quote = '"';

   /** Character which separates fields. */
   private char m_delimiter = ',';

   /** Token type queued for the next call to nextToken, or zero if none is queued. */
   private int m_next;

   /** Type of the token most recently returned by nextToken. */
   private int m_type;

   /** Text of the token currently being assembled or most recently returned. */
   private final StringBuilder m_buffer = new StringBuilder();
}