// PScanner.java example
//
// Explorer
// nmedit-master
/* Copyright (C) 2006 Christian Schneider
 * 
 * This file is part of Nomad.
 * 
 * Nomad is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * Nomad is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Nomad; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/*
 * Created on Dec 19, 2006
 */
package net.sf.nmedit.jpatch.clavia.nordmodular.parser;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

/**
 * Lexer for Nord Modular 3 Patch files.
 *
 * <p>The scanner keeps a single character of lookahead. A character is
 * fetched from the source by {@link #next()} and stays current until it is
 * consumed by {@link #take()} or {@link #appendAndTake()}.</p>
 *
 * <p>A scanner created with the no-argument constructor has no source; one of
 * the <code>setSource</code> methods has to be called before
 * {@link #nextToken()}.</p>
 *
 * @author Christian Schneider
 */
public final class PScanner
{

    // white space characters

    // HORIZONTAL TABULATION
    public static final int C_WS_HT = '\u0009';
    // LINE FEED (U+000A) - written as a decimal value because a unicode
    // escape for a line terminator is not allowed inside a char literal
    public static final int C_WS_LF = 10;
    // VERTICAL TABULATION
    public static final int C_WS_VT = '\u000B';
    // FORM FEED
    public static final int C_WS_FF = '\u000C';
    // CARRIAGE RETURN (U+000D) - decimal for the same reason as C_WS_LF
    public static final int C_WS_CR = 13;
    // FILE SEPARATOR
    public static final int C_WS_FS = '\u001C';
    // GROUP SEPARATOR
    public static final int C_WS_GS = '\u001D';
    // RECORD SEPARATOR
    public static final int C_WS_RS = '\u001E';
    // UNIT SEPARATOR
    public static final int C_WS_US = '\u001F';
    // SPACE
    public static final int C_WS_SP = '\u0020';

    // end of file token / internal end of file character
    public static final int EOF = -1;
    // open bracket token
    public static final int BROPEN  = '[';
    // close bracket token
    public static final int BRCLOSE = ']';
    // slash token
    public static final int SLASH   = '/';
    // equal token
    public static final int EQ      = '=';

    // token classes
    public static final int BASE      = 1000;
    // inline whitespace
    public static final int INLINEWS  = BASE+0;
    // newline characters \n,\r
    public static final int NEWLINEWS = BASE+1;
    // number token '0' | ('-'? [1..9] [0..9]* )
    public static final int NUMBER    = BASE+2;
    // anything else - string token
    public static final int ANY       = BASE+3;

    // marker stored in cbuf when the current character has been consumed;
    // shares its value with EOF, so at the end of the input every call to
    // next() asks the reader again
    private static final int TAKEN = EOF;

    // the reader
    private Reader reader;
    // buffer for string tokens
    private final StringBuilder sbuffer = new StringBuilder();
    // buffer for numbers
    private int ibuffer;
    // current character; starts out consumed so that nothing is reported
    // before a character was actually read from a source
    private int cbuf = TAKEN;
    // current line
    private int line;
    // number of characters read from the reader
    private int position;
    // value of position when the last newline token was completed
    private int newlineposition;

    /**
     * Creates a lexer without a source. Call {@link #setSource(InputStream)}
     * or {@link #setSource(Reader)} before requesting tokens.
     */
    public PScanner()
    {
        super();
    }

    /**
     * Creates a new lexer that reads from the specified input stream.
     * @param stream the source
     */
    public PScanner(InputStream stream)
    {
        this();
        setSource(stream);
    }

    /**
     * Creates a new lexer that reads from the specified reader.
     * @param reader the source
     */
    public PScanner(Reader reader)
    {
        this();
        setSource(reader);
    }

    /**
     * Sets the source and resets the scanner state.
     *
     * The bytes are decoded as ISO-8859-1. If that charset can not be
     * obtained the platform default charset is used instead.
     *
     * @param stream the source
     */
    public void setSource(InputStream stream)
    {
        Charset latin1;
        try
        {
            latin1 = Charset.forName("ISO-8859-1");
        }
        catch (IllegalCharsetNameException ignored)
        {
            // deliberate best effort: fall back to the default charset
            latin1 = null;
        }
        catch (UnsupportedCharsetException ignored)
        {
            // deliberate best effort: fall back to the default charset
            latin1 = null;
        }

        if (latin1 != null)
            setSource(new InputStreamReader(stream, latin1));
        else
            setSource(new InputStreamReader(stream));
    }

    /**
     * Sets the source and resets the scanner state: the buffers are cleared,
     * the line number restarts at 1 and the position restarts at 0.
     *
     * @param reader the source
     */
    public void setSource(Reader reader)
    {
        this.reader = reader;
        sbuffer.setLength(0);
        line = 1;
        newlineposition = 0;
        position = 0;
        ibuffer = 0;
        // drop any lookahead character left over from a previous source
        take();
    }

    /**
     * Returns the number of characters read from the source so far.
     * @return the number of characters read from the source
     */
    public final int getPosition()
    {
        return position;
    }

    /**
     * Returns the current line number.
     * @return the current line number
     */
    public final int getLineNumber()
    {
        return line;
    }

    /**
     * Returns the current column number, computed as the number of
     * characters read since the last newline token was completed.
     * @return the current column number
     */
    public final int getColumn()
    {
        return position-newlineposition;
    }

    /**
     * Returns the current string token.
     *
     * Any token except the number token
     * and the newline whitespace token
     * have a string value. For other tokens
     * the return value is undefined.
     *
     * @return the current string token
     */
    public final String getString()
    {
        return sbuffer.toString();
    }

    /**
     * Returns the current number token or
     * the number of newlines if the current token
     * is a newline whitespace token.
     *
     * For any other token the return value is undefined.
     *
     * @return the current number token or the number of newlines
     */
    public final int getNumber()
    {
        return ibuffer;
    }

    /**
     * Returns the next token in the stream.
     *
     * Defined tokens are
     * <ul>
     * <li>EOF - if the end of the file was reached</li>
     * <li>BROPEN - the '[' character was read</li>
     * <li>BRCLOSE - the ']' character was read</li>
     * <li>SLASH - the '/' character was read</li>
     * <li>EQ - the '=' character was read</li>
     * <li>INLINEWS - whitespace characters except of '\r' '\n' were read</li>
     * <li>NEWLINEWS - the expression ('\r'|'\n')+ was read</li>
     * <li>NUMBER - a number was read</li>
     * <li>ANY - any other character(s) / string token</li>
     * </ul>
     *
     * The value of any token except of NUMBER and NEWLINEWS tokens
     * is returned by {@link #getString()}.
     *
     * The number in a NUMBER token is returned by {@link #getNumber()}
     *
     * The number of newlines in a NEWLINEWS token is
     * returned by {@link #getNumber()}.
     *
     * @return one of the token constants listed above
     * @throws IOException reading from the source failed
     */
    public final int nextToken() throws IOException
    {
        sbuffer.setLength(0);
        ibuffer = 0;

        final int c = next();
        switch (c)
        {
            case '-':
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                return number();
            case BROPEN: case BRCLOSE: case SLASH: case EQ:
                // the token value of these is the character itself
                appendAndTake();
                return c;
            case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
            case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                do
                {
                    appendAndTake();
                }
                while (isInlineWhitespace(next()));
                return INLINEWS;
            case C_WS_CR: case C_WS_LF:
                return newlinews();
            case EOF:
                return EOF;
            default:
                return any();
        }
    }

    /**
     * Tests for a whitespace character other than carriage return
     * and line feed.
     *
     * @param c the character (or EOF)
     * @return true if c is one of the inline whitespace characters
     */
    private static boolean isInlineWhitespace(int c)
    {
        switch (c)
        {
            case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
            case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                return true;
            default:
                return false;
        }
    }

    /**
     * marks the current character as read
     */
    private void take()
    {
        cbuf = TAKEN;
    }

    /**
     * Returns the next character.
     *
     * A new character is only read from the stream if {@link #take()}
     * has been called before.
     *
     * @return the next character or EOF
     * @throws IOException reading from the stream failed
     */
    private int next() throws IOException
    {
        if (cbuf == TAKEN)
        {
            cbuf = reader.read();
            // the position is not advanced at the end of the input
            if (cbuf != TAKEN)
                position++;
        }
        return cbuf;
    }

    /**
     * Appends the current character to the string buffer and
     * marks the character as read.
     */
    private void appendAndTake()
    {
        sbuffer.append((char) cbuf);
        cbuf = TAKEN;
    }

    /**
     * Reads (\r|\n)+
     *
     * Each '\r', each '\n' and each "\r\n" pair counts as one newline.
     * The count is stored in the number buffer and added to the
     * current line number.
     *
     * @return NEWLINEWS token
     * @throws IOException reading from the stream failed
     */
    private int newlinews() throws IOException
    {
        boolean afterCR = false;
        for (;;)
        {
            final int c = next();
            if (c == C_WS_CR)
            {
                ibuffer++;
                afterCR = true;
            }
            else if (c == C_WS_LF)
            {
                // a line feed directly after a carriage return belongs
                // to the line break which was already counted
                if (!afterCR)
                    ibuffer++;
                afterCR = false;
            }
            else
            {
                break;
            }
            take();
        }
        line += ibuffer;
        newlineposition = position;
        return NEWLINEWS;
    }

    /**
     * Reads the next number or string.
     *
     * If the next characters match the regular expression
     * <code>'0' | ('-'? [1..9] [0..9]* )</code>, the value fits into
     * an int after accumulating the digits as a positive number, and the
     * expression is followed by a whitespace character, the end of file
     * or '[' then the NUMBER token is returned. Otherwise
     * the characters read so far become the start of a string token and
     * the ANY token is returned.
     *
     * The characters are collected in the string buffer while they are
     * scanned, so giving up on the number loses no input - neither at the
     * end of the file nor for "-0".
     *
     * @return NUMBER or ANY
     * @throws IOException reading from the stream failed
     */
    private int number() throws IOException
    {
        /*
         * for testing
         *   0 1 -0 -1
         *   -00 -01 -10 -11
         *   0-0 0-1 1-0 1-1
         *   00- 01- 10- 11-
         *   0--0 0--1 1--0 1--1
         */

        boolean sign = false;

        scan:
        for (;;)
        {
            // number of characters of this token consumed so far
            final int consumed = sbuffer.length();
            final int c = next();

            switch (c)
            {
                case '-':
                    // a sign is only valid as the very first character
                    if (consumed != 0)
                        return any();
                    sign = true;
                    break;

                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                    if (consumed > 0 && ibuffer == 0)
                    {
                        // consumed is either "0" (no further digit allowed)
                        // or "-" (which must not be followed by '0')
                        if (!sign || c == '0')
                            return any();
                    }

                    final int digit = c - '0';
                    // exact overflow check, done before multiplying
                    if (ibuffer > (Integer.MAX_VALUE - digit) / 10)
                        return any();
                    ibuffer = ibuffer * 10 + digit;
                    break;

                case '[': case EOF: case C_WS_CR: case C_WS_LF:
                case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
                case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                    // whitespace|eof|'[' terminates the number
                    break scan;

                default:
                    // not a number
                    return any();
            }
            appendAndTake();
        }

        if (sign && sbuffer.length() == 1)
        {
            // a lone '-' is a string token
            return any();
        }

        // a NUMBER token carries no string value
        sbuffer.setLength(0);
        if (sign)
            ibuffer = -ibuffer;
        return NUMBER;
    }

    /**
     * Matches any character except whitespace, end of file
     * or one of the special characters '[', ']', '/', '='.
     *
     * Characters are appended to whatever the string buffer already
     * contains. The token ends early once the buffer holds 100 characters.
     *
     * @return ANY
     * @throws IOException reading from the stream failed
     */
    private int any() throws IOException
    {
        for (;;)
        {
            switch (next())
            {
                case EOF:
                case C_WS_CR: case C_WS_LF:
                case C_WS_HT: case C_WS_VT: case C_WS_FF: case C_WS_FS:
                case C_WS_GS: case C_WS_RS: case C_WS_US: case C_WS_SP:
                case '[': case ']': case '/': case '=':
                    return ANY;
                default:
                    appendAndTake();
                    // split strings which become too long
                    if (sbuffer.length() >= 100)
                        return ANY;
                    break;
            }
        }
    }

}