/* Copyright (C) 2006 Christian Schneider
*
* This file is part of Nomad.
*
* Nomad is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Nomad is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Nomad; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* Created on Dec 19, 2006
*/
package net.sf.nmedit.jpatch.clavia.nordmodular.parser;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
/**
 * Lexer for Nord Modular 3 Patch files.
 *
 * <p>The scanner works with a single character of lookahead: {@link #next()}
 * reads a character into an internal buffer and the character stays there
 * until it is marked as consumed by {@link #take()} or
 * {@link #appendAndTake()}. A source must be set (through one of the
 * constructors taking a source, or through {@code setSource}) before
 * {@link #nextToken()} is called.</p>
 *
 * @author Christian Schneider
 */
public final class PScanner
{

    // white space characters

    // HORIZONTAL TABULATION
    public static final int C_WS_HT = '\u0009';
    // LINE FEED \u000A
    public static final int C_WS_LF = 10;
    // VERTICAL TABULATION
    public static final int C_WS_VT = '\u000B';
    // FORM FEED
    public static final int C_WS_FF = '\u000C';
    // CARRIAGE RETURN \u000D
    public static final int C_WS_CR = 13;
    // FILE SEPARATOR
    public static final int C_WS_FS = '\u001C';
    // GROUP SEPARATOR
    public static final int C_WS_GS = '\u001D';
    // RECORD SEPARATOR
    public static final int C_WS_RS = '\u001E';
    // UNIT SEPARATOR
    public static final int C_WS_US = '\u001F';
    // SPACE
    public static final int C_WS_SP = '\u0020';

    // end of file token / internal end of file character
    public static final int EOF = -1;
    // open bracket token
    public static final int BROPEN = '[';
    // close bracket token
    public static final int BRCLOSE = ']';
    // slash token
    public static final int SLASH = '/';
    // equal token
    public static final int EQ = '=';

    // token classes
    public static final int BASE = 1000;
    // inline whitespace
    public static final int INLINEWS = BASE+0;
    // newline characters \n,\r
    public static final int NEWLINEWS = BASE+1;
    // number token '0' | ('-'? [1..9] [0..9]* )
    public static final int NUMBER = BASE+2;
    // anything else - string token
    public static final int ANY = BASE+3;

    // marker stored in cbuf when the current character has been consumed;
    // it shares its value with EOF, so after the end of the input every
    // call of next() simply asks the reader again
    private final static int TAKEN = EOF;

    // the reader
    private Reader reader;
    // buffer for string tokens
    private StringBuilder sbuffer;
    // buffer for numbers
    private int ibuffer;
    // current character
    private int cbuf;
    // current line
    private int line;
    // current position (number of characters read so far)
    private int position;
    // position of the last newline character that was read
    private int newlineposition;

    /**
     * Creates a lexer without a source. A source has to be set with
     * {@link #setSource(InputStream)} or {@link #setSource(Reader)}
     * before {@link #nextToken()} can be used.
     */
    public PScanner()
    {
        sbuffer = new StringBuilder();
        // no character is buffered yet
        cbuf = TAKEN;
    }

    /**
     * Creates a new lexer that reads from the specified input stream.
     * @param stream the source
     */
    public PScanner(InputStream stream)
    {
        this();
        setSource(stream);
    }

    /**
     * A lexer that reads from the specified reader
     * @param reader the source
     */
    public PScanner(Reader reader)
    {
        this();
        setSource(reader);
    }

    /**
     * Sets the source of the lexer and resets its state.
     *
     * The bytes of the stream are decoded as ISO-8859-1. If that charset
     * can not be obtained the platform default charset is used instead.
     *
     * @param stream the source
     */
    public void setSource(InputStream stream)
    {
        try
        {
            Charset c = Charset.forName("ISO-8859-1");
            setSource(new InputStreamReader(stream, c));
            return;
        }
        catch (IllegalCharsetNameException ignored)
        {
            // deliberately ignored: fall back to the default charset below
        }
        catch (UnsupportedCharsetException ignored)
        {
            // deliberately ignored: fall back to the default charset below
        }
        setSource(new InputStreamReader(stream));
    }

    /**
     * Sets the source of the lexer and resets the line number, the
     * position, the token buffers and the lookahead character.
     *
     * @param reader the source
     */
    public void setSource(Reader reader)
    {
        this.reader = reader;
        sbuffer.setLength(0);
        line = 1;
        newlineposition = 0;
        position = 0;
        ibuffer = 0;
        take();
    }

    /**
     * Returns the number of characters read from the source so far.
     * This includes the lookahead character if one is buffered.
     *
     * @return the number of characters read so far
     */
    public final int getPosition()
    {
        return position;
    }

    /**
     * Returns the current line number.
     * The first line has the number 1.
     * @return the current line number
     */
    public final int getLineNumber()
    {
        return line;
    }

    /**
     * Returns the current column number, which is the number of
     * characters read since the last newline character (or since
     * the start of the input on the first line).
     * @return the current column number
     */
    public final int getColumn()
    {
        return position-newlineposition;
    }

    /**
     * Returns the current string token.
     *
     * Any token except the number token
     * and the newline whitespace token
     * has a string value. For other tokens
     * the return value is undefined.
     *
     * @return the current string token
     */
    public final String getString()
    {
        return sbuffer.toString();
    }

    /**
     * Returns the current number token or
     * the number of newlines if the current token
     * is a newline whitespace token.
     *
     * For any other token the return value is undefined.
     *
     * @return the current number token or the number of newlines
     */
    public final int getNumber()
    {
        return ibuffer;
    }

    /**
     * Returns the next token in the stream.
     *
     * Defined tokens are
     * <ul>
     * <li>EOF - if the end of the file was reached</li>
     * <li>BROPEN - the '[' character was read</li>
     * <li>BRCLOSE - the ']' character was read</li>
     * <li>SLASH - the '/' character was read</li>
     * <li>EQ - the '=' character was read</li>
     * <li>INLINEWS - whitespace characters except of '\r' '\n' were read</li>
     * <li>NEWLINEWS - the expression ('\r'|'\n')+ was read</li>
     * <li>NUMBER - a number was read</li>
     * <li>ANY - any other character(s) / string token</li>
     * </ul>
     *
     * The value of any token except of NUMBER and NEWLINEWS tokens
     * is returned by {@link #getString()}.
     *
     * The number in a NUMBER token is returned by {@link #getNumber()}
     *
     * The number of newlines in a NEWLINEWS token is
     * returned by {@link #getNumber()}. The sequence "\r\n" counts
     * as a single newline.
     *
     * @return the type of the token that was read
     * @throws IOException reading from the source failed
     * @throws IllegalStateException if no source has been set
     */
    public final int nextToken() throws IOException
    {
        if (reader == null)
            throw new IllegalStateException("no source set");

        sbuffer.setLength(0);
        ibuffer = 0;

        switch (next())
        {
            case '-':
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                return number();

            case '[': case ']': case '/': case '=':
                // the token type of these is the character itself
                final int token = cbuf;
                appendAndTake();
                return token;

            case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
            case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                do
                {
                    appendAndTake();
                }
                while (isInlineWhitespace(next()));
                return INLINEWS;

            case C_WS_CR: case C_WS_LF:
                return newlinews();

            case EOF:
                return EOF;

            default:
                return any();
        }
    }

    /**
     * Tests if the character is a whitespace character
     * other than '\r' and '\n'.
     */
    private static boolean isInlineWhitespace(int c)
    {
        switch (c)
        {
            case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
            case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                return true;
            default:
                return false;
        }
    }

    /**
     * marks the current character as read
     */
    private void take()
    {
        cbuf = TAKEN;
    }

    /**
     * Returns the next character.
     *
     * A new character is only read from the stream if {@link #take()}
     * has been called before. The position is only advanced when a
     * character was read; it is not advanced at the end of the input.
     *
     * @return the next character or EOF
     * @throws IOException reading from the stream failed
     */
    private int next() throws IOException
    {
        if (cbuf != TAKEN)
            return cbuf;
        cbuf = reader.read();
        if (cbuf != TAKEN)
            position++;
        return cbuf;
    }

    /**
     * Appends the current character to the string buffer and
     * marks the character as read by calling {@link #take()}
     */
    private void appendAndTake()
    {
        sbuffer.append((char) cbuf);
        take();
    }

    /**
     * Reads (\r|\n)+
     *
     * The number of newlines is stored in the number buffer and added
     * to the current line number. A '\n' directly following a '\r' is
     * not counted separately.
     *
     * @return NEWLINEWS token
     * @throws IOException reading from the stream failed
     */
    private int newlinews() throws IOException
    {
        // true if the previous character was '\r'
        boolean afterCR = false;
        for (;;)
        {
            final int c = next();
            if (c == C_WS_CR)
            {
                ibuffer++;
                afterCR = true;
            }
            else if (c == C_WS_LF)
            {
                // "\r\n" was already counted when '\r' was read
                if (!afterCR)
                    ibuffer++;
                afterCR = false;
            }
            else
            {
                break;
            }
            take();
        }
        line += ibuffer;
        // next() has already read (and counted) the first character of the
        // new line unless the end of the input was reached. The newline
        // position must not include that character, otherwise the column
        // would be 0-based on all lines except the first one.
        newlineposition = (cbuf == EOF) ? position : position-1;
        return NEWLINEWS;
    }

    /**
     * Called if the characters consumed by {@link #number()} turn out
     * not to be a number. The consumed characters are written back to the
     * string buffer and scanning continues with {@link #any()}.
     *
     * @param charCount number of characters consumed so far
     * @param sign true if the first consumed character was '-'
     * @return ANY
     * @throws IOException reading from the stream failed
     */
    private int abortNumber(int charCount, boolean sign) throws IOException
    {
        if (charCount == 0)
            return any();
        if (sign)
        {
            sbuffer.append('-');
            // only the sign was consumed
            if (charCount == 1)
                return any();
        }
        // the digits consumed so far
        sbuffer.append(Integer.toString(ibuffer));
        return any();
    }

    /**
     * Reads the next number or string.
     *
     * If the next characters match the regular expression
     * <code>'0' | ('-'? [1..9] [0..9]* )</code>, the value fits into
     * an int and the expression is followed by a whitespace character,
     * the '[' character or the end of file then the NUMBER token is
     * returned. Otherwise the ANY token is returned.
     *
     * @return the next number or string
     * @throws IOException reading from the stream failed
     */
    private int number() throws IOException
    {
        /*
         * for testing
         * 0 1 -0 -1
         * -00 -01 -10 -11
         * 0-0 0-1 1-0 1-1
         * 00- 01- 10- 11-
         * 0--0 0--1 1--0 1--1
         */
        boolean sign = false;
        // Number of characters consumed so far. The characters are counted
        // explicitly because the position is not advanced when the end of
        // the input is reached, so position differences are off by one there.
        int consumed = 0;

        // read the number
        loop:for(;;)
        {
            switch (next())
            {
                case '0':
                    if (consumed == 1 && (sign || ibuffer == 0))
                    {
                        // -0 or 00: not a number, the current '0' stays
                        // buffered so that any() appends it
                        return abortNumber(consumed, sign);
                    }
                    // fall through
                case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                    if (consumed > 0 && ibuffer == 0 && !sign)
                    {
                        // leading zero
                        return abortNumber(consumed, sign);
                    }
                    final int digit = cbuf-'0';
                    // check for overflow (not a number) before multiplying,
                    // a comparison after the fact misses some wrap arounds
                    if (ibuffer > (Integer.MAX_VALUE-digit)/10)
                        return abortNumber(consumed, sign);
                    // no overflow
                    ibuffer = (ibuffer*10)+digit;
                    break;

                case '[': case EOF: case C_WS_CR: case C_WS_LF:
                case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
                case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                    // whitespace|eof|'['
                    break loop;

                case '-':
                    // only allowed as the first character
                    if (consumed == 0)
                    {
                        sign = true;
                        break;
                    }
                    // fall through
                default:
                    // not a number
                    return abortNumber(consumed, sign);
            }
            take();
            consumed++;
        }

        if (sign && consumed == 1)
        {
            // a single '-' without digits
            return abortNumber(consumed, sign);
        }

        if (sign) ibuffer = -ibuffer;
        return NUMBER;
    }

    /**
     * Matches any character except whitespace, end of file
     * or one of the special characters '[', ']', '/', '='.
     *
     * The characters are appended to the string buffer. The token ends
     * early if the string buffer contains 100 or more characters.
     *
     * @return ANY
     * @throws IOException reading from the stream failed
     */
    private int any() throws IOException
    {
        for(;;)
        {
            switch (next())
            {
                case EOF:
                case C_WS_CR: case C_WS_LF:
                case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
                case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                case '[': case ']': case '/': case '=':
                    return ANY;
                default:
                    appendAndTake();
                    // split strings which become too long
                    if (sbuffer.length() >= 100)
                        return ANY;
                    break;
            }
        }
    }
}