/* * * Copyright 2012 lexergen. * This file is part of lexergen. * * lexergen is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * lexergen is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with lexergen. If not, see <http://www.gnu.org/licenses/>. * * lexergen: * A tool to chunk source code into tokens for further processing in a compiler chain. * * Projectgroup: bi, bii * * Authors: Johannes Dahlke * * Module: Softwareprojekt Übersetzerbau 2012 * * Created: Apr. 2012 * Version: 1.0 * */ package de.fuberlin.bii.regextodfaconverter.directconverter.regex; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.UUID; import de.fuberlin.bii.regextodfaconverter.Regex; import de.fuberlin.bii.regextodfaconverter.RegexInvalidException; import de.fuberlin.bii.regextodfaconverter.directconverter.DirectConverterException; import de.fuberlin.bii.regextodfaconverter.directconverter.PositionToPayloadMap; import de.fuberlin.bii.regextodfaconverter.directconverter.lrparser.grammar.Terminal; import de.fuberlin.bii.regextodfaconverter.directconverter.regex.operatortree.RegexOperatorTree; import de.fuberlin.bii.regextodfaconverter.directconverter.regex.operatortree.RegularExpressionElement; import de.fuberlin.bii.regextodfaconverter.directconverter.regex.operatortree.TerminalNode; import de.fuberlin.bii.regextodfaconverter.directconverter.syntaxtree.node.Leaf; import de.fuberlin.bii.regextodfaconverter.directconverter.syntaxtree.node.TreeNode; import de.fuberlin.bii.regextodfaconverter.directconverter.syntaxtree.node.TreeNodeCollection; import de.fuberlin.bii.regextodfaconverter.directconverter.syntaxtree.node.TreeNodeSet; import de.fuberlin.bii.regextodfaconverter.fsm.FiniteStateMachine; import de.fuberlin.bii.regextodfaconverter.fsm.State; import de.fuberlin.bii.regextodfaconverter.fsm.StatePayload; import de.fuberlin.bii.regextodfaconverter.fsm.Transition; import de.fuberlin.bii.utils.Notification; import de.fuberlin.bii.utils.Test; /** * Stellt Funktionalitäten bereit, um einen vereinfachten regulären Ausdruck in * eine DFA umzuwandeln. * * @author Johannes Dahlke * * @see <a * href="http://kontext.fraunhofer.de/haenelt/kurs/folien/Haenelt_FSA_RegExFSA.pdf">Fraunhofer * Institut: Überführung regulärer Ausdrücke in endliche Automaten</a> * @see <a * href="http://kontext.fraunhofer.de/haenelt/kurs/folien/Haenelt_RegEx-FSA-GMY.pdf">Fraunhofer * Institut: Der Algorithmus von Glushkov und McNaughton/Yamada</a> * @see <a * href="http://kontext.fraunhofer.de/haenelt/kurs/folien/FSA-RegA-6.pdf">Endliche * Automaten: Reguläre Mengen, Reguläre Ausdrücke, reguläre Sprachen und * endliche Automaten</a> * @see <a * href="http://kontext.fraunhofer.de/haenelt/kurs/Skripten/FSA-Skript/Haenelt_EA_RegEx2EA.pdf">Überführung * regulärer Ausdrücke in endliche Automaten</a> */ @SuppressWarnings("rawtypes") public class RegexToDfaConverter { /** * Wandelt einen regulären Ausdruck in einen DFA um. * * @param Regex * der reguläre Ausdruck in vereinfachter Form. * @param <StatePayloadType> * der Inhalt, welcher Zuständen zugeordnet sein kann. * @return ein DFA * @throws Exception * */ public static FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> convert( String regex, StatePayload commonPayload) throws DirectConverterException { PositionToPayloadMap<StatePayload> positionToPayloadMap = new PositionToPayloadMap<StatePayload>(); return convert( regex, positionToPayloadMap, commonPayload); } public static FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> convert( RegexToPayloadMap<StatePayload> regexToPayloadMap) throws DirectConverterException { String concatenatedRegex = ""; PositionToPayloadMap<StatePayload> positionToPayloadMap = new PositionToPayloadMap<StatePayload>(); for ( String regex : regexToPayloadMap.keySet()) { if ( !concatenatedRegex.isEmpty()) concatenatedRegex += "|"; concatenatedRegex += "(" + regex +")"; positionToPayloadMap.put( concatenatedRegex.length() -1, regexToPayloadMap.get( regex)); } return convert( concatenatedRegex, positionToPayloadMap); } public static FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> convert( String regex, PositionToPayloadMap<StatePayload> positionToPayloadMap) throws DirectConverterException { return convert( regex, positionToPayloadMap, null); } public static FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> convert( String regex, PositionToPayloadMap<StatePayload> positionToPayloadMap, StatePayload commonPayload) throws DirectConverterException { int regexLength = regex.length(); @SuppressWarnings("unchecked") RegularExpressionElement<StatePayload>[] regularExpression = new RegularExpressionElement[regexLength]; for ( int i = 0; i < regexLength; i++) { regularExpression[i] = new RegularExpressionElement<StatePayload>( regex.charAt( i), positionToPayloadMap.get( i)); } return convert( regularExpression, commonPayload); } public static FiniteStateMachine<Character, ? extends de.fuberlin.bii.tokenmatcher.StatePayload> convert( RegularExpressionElement<StatePayload>[] regularExpression, StatePayload commonPayload) throws DirectConverterException { try { RegexOperatorTree<StatePayload> regexTree = convertRegexToTree( regularExpression); if ( Test.isUnassigned( commonPayload)) commonPayload = getWeakestPayload( regularExpression); FiniteStateMachine<Character, StatePayload> dfa = convertRegexTreeToDfa( regexTree, commonPayload); return dfa; } catch ( Exception e) { Notification.printDebugException( e); String regexExpression = ""; for ( RegularExpressionElement<StatePayload> regularExpressionElement : regularExpression) { regexExpression += regularExpressionElement.getValue(); } throw new DirectConverterException( String.format( "Cannot convert regex '%s' to DFA.", regularExpression)); } } private static StatePayload getWeakestPayload( RegularExpressionElement<StatePayload>[] regularExpression) { StatePayload weakestPayload = null; for ( RegularExpressionElement<StatePayload> regularExpressionElement : regularExpression) { if ( Test.isUnassigned( regularExpressionElement)) continue; if ( Test.isAssigned( weakestPayload)) { if ( Test.isAssigned( regularExpressionElement.getPayload())) { if ( weakestPayload.getPriority() > regularExpressionElement.getPayload().getPriority()) weakestPayload = regularExpressionElement.getPayload(); } } else { weakestPayload = regularExpressionElement.getPayload(); } } return weakestPayload; } /** * * @param Regex * @return * @throws Exception */ private static RegexOperatorTree<StatePayload> convertRegexToTree( RegularExpressionElement<StatePayload>[] regularExpression) throws Exception { RegexOperatorTree<StatePayload> regexTree = new RegexOperatorTree<StatePayload>( regularExpression); return regexTree; } @SuppressWarnings("null") private static StatePayload getBestPayloadFromTreeNodeCollectionForCharacter( TreeNodeCollection collection, Character theCharacter) { StatePayload result = null; for ( TreeNode node : collection) { if ( node instanceof TerminalNode) { @SuppressWarnings("unchecked") RegularExpressionElement<StatePayload> nodeValue = (RegularExpressionElement<StatePayload>)((TerminalNode)node).getValue(); if ( nodeValue.getValue().equals( theCharacter)) { StatePayload currentPayload = nodeValue.getPayload(); if ( Test.isAssigned( currentPayload)) { if (Test.isUnassigned( result)) result = currentPayload; else { if ( result.getPriority() < currentPayload.getPriority()) result = currentPayload; } } } } } return result; } /** * Speichert den Payload für einen Zielzustand ausgehend von einem Quellzustand durch lesen des angegebenen Zeichens in der gegebenen stateToStateMap. * @param stateToStateMap * @param fromState * @param toState * @param theCharacter * @param thePayloadToSet * @return */ private static boolean storePayloadPriorityDependentForTransitionFromStateToStateByCharacter( Map<State,Map<State, Map<Character,StatePayload>>> stateToStateMap, State fromState, State toState, Character theCharacter, StatePayload thePayloadToSet) { Map<State, Map<Character,StatePayload>> stateToCharacterPayloadMap = stateToStateMap.get( toState); if ( Test.isUnassigned( stateToCharacterPayloadMap)) { stateToCharacterPayloadMap = new HashMap<State, Map<Character,StatePayload>>(); stateToStateMap.put( toState, stateToCharacterPayloadMap); } Map<Character,StatePayload> characterToPayloadMap = stateToCharacterPayloadMap.get( fromState); if ( Test.isUnassigned( characterToPayloadMap)) { characterToPayloadMap = new HashMap<Character,StatePayload>(); stateToCharacterPayloadMap.put( fromState, characterToPayloadMap); } StatePayload storedPayload = characterToPayloadMap.get( theCharacter); if ( Test.isAssigned( storedPayload)) { if ( storedPayload.getPriority() < thePayloadToSet.getPriority()) { characterToPayloadMap.put( theCharacter, thePayloadToSet); return true; } } else { characterToPayloadMap.put( theCharacter, thePayloadToSet); return true; } return false; } /** * Liefert den in der stateToStateMap gespeicherten Payload für das angegebene Zeichen. * @param stateToStateMap * @param fromState * @param toState * @param theCharacter * @return Den gefundenen Payload oder null. */ private static StatePayload getPayloadForTransitionFromStateToStateByCharacter( Map<State,Map<State, Map<Character,StatePayload>>> stateToStateMap, State fromState, State toState, Character theCharacter) { Map<State, Map<Character,StatePayload>> stateToCharacterPayloadMap = stateToStateMap.get( toState); if ( Test.isUnassigned( stateToCharacterPayloadMap)) return null; Map<Character,StatePayload> characterToPayloadMap = stateToCharacterPayloadMap.get( fromState); if ( Test.isUnassigned( characterToPayloadMap)) return null; StatePayload storedPayload = characterToPayloadMap.get( theCharacter); if ( Test.isUnassigned( storedPayload)) return null; return storedPayload; } /** * Liefert einen der Payloads mit der höchsten Priorität für einen gegeben Zustand mit Blick auf die stateToStateMap Struktur. * @param stateToStateMap * @param theState * @return */ @SuppressWarnings("null") private static StatePayload getBestPayloadForState( Map<State,Map<State, Map<Character,StatePayload>>> stateToStateMap, State theState) { Map<State, Map<Character,StatePayload>> stateToCharacterPayloadMap = stateToStateMap.get( theState); if ( Test.isUnassigned( stateToCharacterPayloadMap)) return null; StatePayload result = null; for ( State<Character, StatePayload> sourceState : stateToCharacterPayloadMap.keySet()) { Map<Character,StatePayload> characterToPayloadMap = stateToCharacterPayloadMap.get( sourceState); for ( Character character : characterToPayloadMap.keySet()) { StatePayload storedPayload = characterToPayloadMap.get( character); if ( Test.isUnassigned( storedPayload)) continue; if ( Test.isUnassigned( result)) result = storedPayload; else if ( result.getPriority() < storedPayload.getPriority()) { result = storedPayload; } } } return result; } /** * Liefert einen der Payloads mit der niedrigsten Priorität für einen gegeben Zustand mit Blick auf die stateToStateMap Struktur. * @param stateToStateMap * @param theState * @return */ @SuppressWarnings("null") private static StatePayload getWeakestPayloadForState( Map<State,Map<State, Map<Character,StatePayload>>> stateToStateMap, State theState) { Map<State, Map<Character,StatePayload>> stateToCharacterPayloadMap = stateToStateMap.get( theState); if ( Test.isUnassigned( stateToCharacterPayloadMap)) return null; StatePayload result = null; for ( State<Character, StatePayload> sourceState : stateToCharacterPayloadMap.keySet()) { Map<Character,StatePayload> characterToPayloadMap = stateToCharacterPayloadMap.get( sourceState); for ( Character character : characterToPayloadMap.keySet()) { StatePayload storedPayload = characterToPayloadMap.get( character); if ( Test.isUnassigned( storedPayload)) continue; if ( Test.isUnassigned( result)) result = storedPayload; else if ( result.getPriority() > storedPayload.getPriority()) { result = storedPayload; } } } return result; } /** * Konvertiert einen annotierten Syntaxbaum in einen deterministischen * endlichen Automaten * * @param syntaxTree * @return * @throws DirectConverterException * @throws Exception */ @SuppressWarnings("unchecked") private static FiniteStateMachine<Character, StatePayload> convertRegexTreeToDfa( RegexOperatorTree<StatePayload> regexTree, StatePayload commonPayload) throws DirectConverterException { try { HashMap<TreeNodeCollection, State<Character, StatePayload>> unhandledStates = new HashMap<TreeNodeCollection, State<Character, StatePayload>>(); HashMap<TreeNodeCollection, State<Character, StatePayload>> handledStates = new HashMap<TreeNodeCollection, State<Character, StatePayload>>(); FiniteStateMachine<Character, StatePayload> dfa = new FiniteStateMachine<Character, StatePayload>(); // add start state as unhandled unhandledStates.put( regexTree.getFirstPositions().get( regexTree.getRoot()), dfa.getInitialState()); // maps the target states to a map of source states with corresponding payloads Map<State,Map<State, Map<Character,StatePayload>>> payloadToStateMap = new HashMap<State,Map<State, Map<Character,StatePayload>>>(); StatePayload currentStatePayload = null; Set<RegularExpressionElement<StatePayload>> alphabetSubset = new HashSet<RegularExpressionElement<StatePayload>>(); for ( Leaf leaf : regexTree.getLeafSet()) { alphabetSubset.add( (RegularExpressionElement<StatePayload> ) leaf.getValue()); } State<Character, StatePayload> currentState; TreeNodeCollection currentCollection; while ( !unhandledStates.isEmpty()) { // get the next unhandled state ... currentCollection = unhandledStates.keySet().iterator().next(); currentState = unhandledStates.remove( currentCollection); dfa.setCurrentState( currentState); // ... and mark it as handled handledStates.put( currentCollection, currentState); HashMap<Character, TreeNodeCollection> stateCandidates = new HashMap<Character, TreeNodeCollection>(); for ( RegularExpressionElement<StatePayload> currentRegexElement : alphabetSubset) { TreeNodeCollection followPositionsOfTerminal = new TreeNodeSet(); for ( TreeNode node : currentCollection) { if ( node instanceof TerminalNode) { RegularExpressionElement terminalNodeRegexElement = (RegularExpressionElement)((TerminalNode)node).getValue(); if ( terminalNodeRegexElement.equals( currentRegexElement)) { // use equals() instead of equalsTotally() followPositionsOfTerminal.addAll( regexTree.getFollowPositions().get( node)); } } } // if set not empty, then add set to states State<Character, StatePayload> targetState = null; if ( !followPositionsOfTerminal.isEmpty()) { // ermittle Übergang-spezifischen Payload currentStatePayload = getBestPayloadFromTreeNodeCollectionForCharacter( currentCollection, currentRegexElement.getValue()); // Oder falls keiner definiert, dann den allgemeinen Payload, sofern es sich um das Ende handelt if ( Test.isUnassigned( currentStatePayload) && followPositionsOfTerminal.contains( regexTree.getTerminatorNode())) currentStatePayload = commonPayload; // Ansonsten wie im Algorithmus von Glushkov / McNaughton and Yamada beschrieben verfahren if ( !handledStates.containsKey( followPositionsOfTerminal) && !unhandledStates.containsKey( followPositionsOfTerminal)) { targetState = new State<Character, StatePayload>(); unhandledStates.put( followPositionsOfTerminal, targetState); } else if ( handledStates.containsKey( followPositionsOfTerminal)) { targetState = handledStates.get( followPositionsOfTerminal); } else { targetState = unhandledStates.get( followPositionsOfTerminal); } // setze Übergang dfa.addTransition( targetState, currentRegexElement.getValue()); // falls das Terminalzeichen # folgt, if ( followPositionsOfTerminal.contains( regexTree.getTerminatorNode())) { // dann ist der Zustand ein Endzustand ... // falls kein spezifischer Payload gegeben ist .. if ( Test.isUnassigned( currentStatePayload)) { // dann setzte denn allgemeinen Payload currentStatePayload = commonPayload; } // Ist der Folgezustand bereits als Endzustand markiert, ... if ( targetState.isFiniteState()) { // ... dann, sofern ein Payload gegeben ist if ( Test.isAssigned( currentStatePayload)) { // Teste, ob dem Folgezustand bereits ein Payload zuvor zugewiesen wurde if ( Test.isAssigned( targetState.getPayload())) { // In diesem Fall vergleiche die Prioritäten den neuen Payloads mit dem alten Payload if ( targetState.getPayload().getPriority() < currentStatePayload.getPriority()) // und setze den neuen Payload nur, wenn dieser eine höhere Priorität hat targetState.setPayload( currentStatePayload); // merke den Payload und dessen Priorität storePayloadPriorityDependentForTransitionFromStateToStateByCharacter( payloadToStateMap, currentState, targetState, currentRegexElement.getValue(), currentStatePayload); } else { // anderenfalls (es wurde kein früherer Payload gefunden) , dann detzte den neuen Payload bedingungslos. targetState.setPayload( currentStatePayload); // merke den Payload und dessen Priorität storePayloadPriorityDependentForTransitionFromStateToStateByCharacter( payloadToStateMap, currentState, targetState, currentRegexElement.getValue(), currentStatePayload); } } } else { // der Folgezustand ist bislang kein Endzustand // dann setze den Zustand finite targetState.setFinite( true); // and set payload (be care: currentStatePayload can here be null, if Commonpayload is null) if ( Test.isAssigned( currentStatePayload)) { targetState.setPayload( currentStatePayload); // speichere Datum für die Nachbereitung storePayloadPriorityDependentForTransitionFromStateToStateByCharacter(payloadToStateMap, currentState, targetState, currentRegexElement.getValue(), currentStatePayload); } } } } } } //------------------------------------ // +++ slightly Modification of algorithm of Glushkov / McNaughton and Yamada +++ // --------------------------------------- // posthumously untie the finate and terminating node by payloads Map<UUID, State<Character, StatePayload>> dfaStates = (Map<UUID, State<Character, StatePayload>>) dfa.getStates().clone(); State<Character, StatePayload> currentDfaState; Set<UUID> knownFiniteTerminateStates = new HashSet<UUID>(); Map<UUID,StatePayload> knownFiniteIntermediateStates = new HashMap<UUID, StatePayload>(); for ( UUID stateId : dfaStates.keySet()) { // Untersuche alle Zustände auf Übergänge in den final Zustand currentDfaState = dfaStates.get( stateId); Set< Transition<Character, StatePayload>> transitionSet = currentDfaState.getTransitions(); Set< Transition<Character, StatePayload>> transitionSetCopy = (Set<Transition<Character, StatePayload>>) currentDfaState.getTransitions().clone(); for ( Transition<Character, StatePayload> transition : transitionSetCopy) { // wenn der Übergang in einen final Zustand führt if ( transition.getState().isFiniteState()) { // get original payload State<Character, StatePayload> targetState = transition.getState(); State<Character, StatePayload> sourceState = currentDfaState; StatePayload payload = getPayloadForTransitionFromStateToStateByCharacter( payloadToStateMap, sourceState, targetState, transition.getCondition()); if ( Test.isAssigned( payload)) { if ( transition.getState().getElementsOfOutgoingTransitions().isEmpty()) { // Fall 1: Es gehen aus diesem Endzustand keine weiteren Übergänge mehr aus. ->(F) if ( !knownFiniteTerminateStates.contains( targetState.getUUID())) { // the first transition must not handled knownFiniteTerminateStates.add( targetState.getUUID()); // but update the payload targetState.setPayload( payload); } else { Character terminal = transition.getCondition(); // Biege den Übergang auf einen neuen Endzustand um. transitionSet.remove( transition); State<Character, StatePayload> newFinalState = new State<Character, StatePayload>( payload, true); dfa.setCurrentState( currentDfaState); dfa.addTransition( newFinalState, terminal); } } else { // Fall 2: Es ist ein akzeptierender Zustand, aber es führen auch wieder Übergänge heraus. ->(F)-> StatePayload weakestPayload = null; // Merke den Zustand mit ausgerechnetem niederwertigsten Payload beim ersten Besuch if ( !knownFiniteIntermediateStates.containsKey( targetState.getUUID())) { weakestPayload = getWeakestPayloadForState( payloadToStateMap, targetState); targetState.setPayload( weakestPayload); knownFiniteIntermediateStates.put( targetState.getUUID(), weakestPayload); } else { weakestPayload = knownFiniteIntermediateStates.get( targetState.getUUID()); } // Wenn der Payload bereits der niederwertigste ist, dann belasse die Übergänge wie gehabt if ( Test.isUnassigned( weakestPayload) || payload.equals( weakestPayload)) { // do nothing continue; } // anderenfalls, wenn der Payload höherwertig ist, dann füge einen akzeptierenden Zwischenzustand ein // assert payload != weakestpayload Character terminal = transition.getCondition(); // füge einen akzeptierenden Zwischenzustand mit dem entsprechenden Payload ein State<Character, StatePayload> interState = new State<Character, StatePayload>( payload, true); dfa.addTransition( sourceState, interState, terminal); // kopiere alle ausgehenden Zustände der Ursprungszustand for ( Transition<Character, StatePayload> targetTransition : targetState.getTransitions()) { dfa.addTransition( interState, targetTransition.getState(), targetTransition.getCondition()); } // entferne die direkte Verbindung in den ursprünglichen targetState transitionSet.remove( transition); } } } } } assert dfa.isDeterministic(); return dfa; } catch ( Exception e) { Notification.printDebugException( e); throw new DirectConverterException( "Cannot convert syntax tree to DFA. " + e.getMessage()); } } }