/* * * Copyright 2012 lexergen. * This file is part of lexergen. * * lexergen is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * lexergen is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with lexergen. If not, see <http://www.gnu.org/licenses/>. * * lexergen: * A tool to chunk source code into tokens for further processing in a compiler chain. * * Projectgroup: bi, bii * * Authors: Johannes Dahlke * * Module: Softwareprojekt Übersetzerbau 2012 * * Created: Apr. 2012 * Version: 1.0 * */ package de.fuberlin.bii.tokenmatcher; import de.fuberlin.bii.tokenmatcher.attributes.Attribute; import de.fuberlin.bii.tokenmatcher.errorhandler.ErrorCorrector; import de.fuberlin.bii.tokenmatcher.errorhandler.ErrorCorrectorException; import de.fuberlin.bii.utils.Notification; import de.fuberlin.bii.bufferedreader.LexemeReader; import de.fuberlin.bii.bufferedreader.LexemeReaderException; import de.fuberlin.bii.bufferedreader.SpecialChars; /** * Der Tokenizer implementiert die {@link LexerToParserInterface}-Schnittstelle, über welche der Parser Token anfordern kann. * Auf Anforderung eines Token reagiert der Tokenizer seinerseits durch zeichenweises Anfordern der Eingabe und speist damit einen deterministischen endlichen Automaten. * Akzeptiert der DEA die Eingabe, so liefert der Tokenizer den erkannten Token, anderenfalls leitet der Tokenizer eine Fehlerbahandlung nach gewählten Fehlerkorrekturmodus ein. * * @author Johannes Dahlke * */ public class Tokenizer implements LexerToParserInterface { private DeterministicFiniteAutomata<Character, StatePayload> dfa; private LexemeReader lexemeReader; private ErrorCorrector errorCorrector; private int currentLine = 1; private int currentPositionInLine = 0; private int lastLine = 1; private int lastPositionInLine = 0; private enum ReadMode { READ_NORMAL, READ_BLOCK_COMMENT, READ_LINE_COMMENT } private ReadMode readMode = ReadMode.READ_NORMAL; public Tokenizer( LexemeReader lexemeReader, DeterministicFiniteAutomata<Character, StatePayload> dfa) throws Exception { super(); this.dfa = dfa; this.lexemeReader = lexemeReader; errorCorrector = new ErrorCorrector( ErrorCorrector.CorrectionMode.PANIC_MODE);//Settings.getErrorCorrectionMode()); } public Token getNextToken() throws LexemeReaderException, LexemIdentificationException { Character currentChar; String currentLexem = ""; dfa.resetToInitialState(); boolean eofReached = false; while ( !eofReached) { currentChar = lexemeReader.getNextChar(); currentPositionInLine++; // handle white spaces if ( currentLexem.isEmpty() // Nur wenn nicht bereits ein Lexem verarbeitet wird. // Soll ermöglichen, dass auch ein Zeichen über das Ende des zu lesenden // Lexem gelesen werden kann, auch wenn es ein whitespace ist. && SpecialChars.isWhiteSpace( currentChar)) { // count newlines if( SpecialChars.isNewLine( currentChar)) { // handle \r\n for windows systems if ( currentChar == SpecialChars.CHAR_CARRIAGE_RETURN) { if ( lexemeReader.getNextChar() != SpecialChars.CHAR_LINE_FEED) lexemeReader.stepBackward( 1); } currentLine++; currentPositionInLine = 0; } // skip whitespaces continue; } // if we read EOF and there is no lexem left if ( SpecialChars.isEOF( currentChar) && currentLexem.isEmpty()) { // then skip eofReached = true; break; } if ( dfa.canChangeStateByElement( currentChar)) { currentLexem += currentChar; dfa.changeStateByElement( currentChar); // TODO if ( dfa.getCurrentState().isFiniteState()) then remember in this possible match (error handling aspect) } else if ( !currentLexem.isEmpty() && dfa.getCurrentState().isFiniteState()) { StatePayload payload = dfa.getCurrentState().getPayload(); // Lesezeiger zurücksetzen um das, was zuviel gelesen wurde. // In dieser implementierung immer 1 Zeichen if ( currentChar != SpecialChars.CHAR_EOF) lexemeReader.stepBackward( 1); // Token erstellen String tokenType = payload.getTokenType(); Attribute attribute = payload.getAttribute(); Object attributeValue = attribute.lexemToValue( currentLexem); Token recognisedToken = new Token( tokenType, attributeValue, currentLine, currentPositionInLine); // gelesenenes Lexem akzeptieren lexemeReader.accept(); // update position counter lastLine = currentLine; lastPositionInLine = currentPositionInLine; // Fehlerbehandler rücksetzen errorCorrector.reset(); // filter comments if ( ( readMode == ReadMode.READ_NORMAL) && ( Token.isTokenStartingBlockComment( recognisedToken))) { readMode = ReadMode.READ_BLOCK_COMMENT; while ( !Token.isTokenEndingBlockComment( getNextToken())){ // ignore comment block } readMode = ReadMode.READ_NORMAL; return getNextToken(); } else if ( ( readMode == ReadMode.READ_NORMAL) && ( Token.isTokenLineComment( recognisedToken))) { readMode = ReadMode.READ_LINE_COMMENT; int thisLine = currentLine; while ( thisLine == currentLine){ // ignore remaining line recognisedToken = getNextToken(); if ( recognisedToken.isEofToken()) { eofReached = true; break; } } readMode = ReadMode.READ_NORMAL; return recognisedToken; } else return recognisedToken; } else if ( readMode == ReadMode.READ_NORMAL){ // error handling String mismatchMessage = ""; try { mismatchMessage = errorCorrector.handleMismatch( currentChar, lexemeReader, dfa, currentLine, currentPositionInLine); } catch ( ErrorCorrectorException e) { // then skip Notification.printMismatchMessage( String.format( "Cannot resolve lexem '%s'. Abort lexing.", currentLexem)); eofReached = true; break; } // Otherwise, the error corrector has found a solution that solve the problem. // But first, we let the user know about the conflict by throwing an exception Notification.printMismatchMessage( mismatchMessage); throw new LexemIdentificationException( mismatchMessage); } else { // ignore, cause we scan a comment at the moment } } return Token.getEofToken(); } public void reset() throws LexemeReaderException { dfa.resetToInitialState(); lexemeReader.reopen(); } }