/* * * Copyright 2012 lexergen. * This file is part of lexergen. * * lexergen is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * lexergen is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with lexergen. If not, see <http://www.gnu.org/licenses/>. * * lexergen: * A tool to chunk source code into tokens for further processing in a compiler chain. * * Projectgroup: bi, bii * * Authors: Johannes Dahlke * * Module: Softwareprojekt Übersetzerbau 2012 * * Created: Apr. 2012 * Version: 1.0 * */ package de.fuberlin.bii.regextodfaconverter.directconverter; import org.junit.Assert; import org.junit.Test; import de.fuberlin.bii.bufferedreader.BufferedLexemeReader; import de.fuberlin.bii.bufferedreader.LexemeReader; import de.fuberlin.bii.regextodfaconverter.MinimalDfa; import de.fuberlin.bii.regextodfaconverter.directconverter.regex.RegexToDfaConverter; import de.fuberlin.bii.regextodfaconverter.directconverter.regex.RegexToPayloadMap; import de.fuberlin.bii.regextodfaconverter.fsm.FiniteStateMachine; import de.fuberlin.bii.regextodfaconverter.fsm.StatePayload; import de.fuberlin.bii.tokenmatcher.LexemIdentificationException; import de.fuberlin.bii.tokenmatcher.Token; import de.fuberlin.bii.tokenmatcher.Tokenizer; import de.fuberlin.bii.tokenmatcher.attributes.ParseIntAttribute; import de.fuberlin.bii.tokenmatcher.attributes.ParseStringAttribute; import de.fuberlin.bii.tokenmatcher.attributes.StringAttribute; import de.fuberlin.bii.utils.Notification; /** * * Testet die Erkennung von Token mittels eines Dfa's, der direkt als DFA aufgebaut wurde. * * @author Johannes Dahlke * */ public class RegexToDfaTest { @SuppressWarnings({ "unchecked", "rawtypes", "static-method"}) @Test public void testDeterminism() throws Exception { Notification.enableDebugPrinting(); Notification.enableDebugInfoPrinting(); RegexToPayloadMap<StatePayload> regexToPayloadMap = new RegexToPayloadMap<StatePayload>(); regexToPayloadMap.put( "(a|b)*abb", new StatePayload( "NUM", new ParseIntAttribute())); // regexToPayloadMap.put( "c(1|2)*3", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "OP", new ParseStringAttribute())); FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> fsm = new RegexToDfaConverter() .convert( regexToPayloadMap); LexemeReader lexemeReader = new BufferedLexemeReader("tests/resources/de/fuberlin/bii/source/tokenmatcher/regex.fun");// new SimpleLexemeReader( Notification.printDebugInfoMessage( fsm.toString()); Notification.printDebugInfoMessage( (new MinimalDfa( fsm)).toString()); Notification.printDebugInfoMessage( "Deterministic: " + fsm.isDeterministic()); Assert.assertTrue( fsm.isDeterministic()); } @SuppressWarnings({ "rawtypes", "unchecked", "static-method"}) @Test public void testSubTokenRecognition() throws Exception { Notification.enableDebugPrinting(); Notification.enableDebugInfoPrinting(); RegexToPayloadMap<StatePayload> regexToPayloadMap = new RegexToPayloadMap<StatePayload>(); regexToPayloadMap.put( "def", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "SYM", new StringAttribute( "DEF"), 2)); regexToPayloadMap.put( "if", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "SYM", new StringAttribute( "IF"), 1)); regexToPayloadMap.put( "[ifdef]*", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "ID", new ParseStringAttribute(),0)); FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> fsm = new RegexToDfaConverter() .convert( regexToPayloadMap); LexemeReader lexemeReader = new BufferedLexemeReader("tests/resources/de/fuberlin/bii/source/tokenmatcher/regex2.fun");// new SimpleLexemeReader( Notification.printDebugInfoMessage( fsm.toString()); Notification.printDebugInfoMessage( (new MinimalDfa( fsm)).toString()); Notification.printDebugInfoMessage( "Deterministic: " + fsm.isDeterministic()); Tokenizer tokenizer = new Tokenizer( lexemeReader, new MinimalDfa( fsm)); Token currentToken = null; Boolean tokenIdentificationFailed = false; // recognize symbol if currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<SYM, IF>",currentToken.toString()); // recognize identifier ide currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<ID, ide>",currentToken.toString()); // recognize symbol def currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<SYM, DEF>",currentToken.toString()); } @SuppressWarnings({ "unchecked", "rawtypes", "static-method"}) @Test public void testTokenRecognition() throws Exception { Notification.enableDebugPrinting(); Notification.enableDebugInfoPrinting(); RegexToPayloadMap<StatePayload> regexToPayloadMap = new RegexToPayloadMap<StatePayload>(); regexToPayloadMap.put( "(1|2)*3", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "NUM", new ParseIntAttribute(), 2)); regexToPayloadMap.put( "c[12]{0,}3", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "ID", new ParseStringAttribute(), 1)); regexToPayloadMap.put( "[1-5\\-A]", new de.fuberlin.bii.regextodfaconverter.fsm.StatePayload( "CHAR", new ParseStringAttribute(),0)); FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> fsm = new RegexToDfaConverter() .convert( regexToPayloadMap); LexemeReader lexemeReader = new BufferedLexemeReader("tests/resources/de/fuberlin/bii/source/tokenmatcher/regex.fun");// new SimpleLexemeReader( Notification.printDebugInfoMessage( fsm.toString()); Notification.printDebugInfoMessage( (new MinimalDfa( fsm)).toString()); Notification.printDebugInfoMessage( "Deterministic: " + fsm.isDeterministic()); Tokenizer tokenizer = new Tokenizer( lexemeReader, new MinimalDfa( fsm)); Token currentToken = null; Boolean tokenIdentificationFailed = false; // do not match first "c" try { tokenizer.getNextToken(); } catch (LexemIdentificationException e) { tokenIdentificationFailed = true; } Assert.assertTrue( tokenIdentificationFailed); // recognize number 123 currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<NUM, 123>",currentToken.toString()); // it must be an integer Assert.assertTrue( currentToken.getAttribute() instanceof Integer); // recognize string c123 currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<ID, c123>",currentToken.toString()); // it must be a string Assert.assertTrue( currentToken.getAttribute() instanceof String); // do not match "b" try { tokenizer.getNextToken(); } catch (LexemIdentificationException e) { tokenIdentificationFailed = true; } Assert.assertTrue( tokenIdentificationFailed); // recognize string c3 currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<ID, c3>",currentToken.toString()); // it must be a string Assert.assertTrue( currentToken.getAttribute() instanceof String); // recognize string 5 currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<CHAR, 5>",currentToken.toString()); // recognize string - currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<CHAR, ->",currentToken.toString()); // recognize string A currentToken = tokenizer.getNextToken(); Assert.assertEquals( "<CHAR, A>",currentToken.toString()); } }