// SimpleRegexLexer.java example
//
// Explorer
// Scute-master
/*
 * Copyright 2008 Ayman Al-Sairafi ayman.alsairafi@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License
 *       at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package jsyntaxpane.lexers;

import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.Segment;
import jsyntaxpane.Lexer;
import jsyntaxpane.Token;
import jsyntaxpane.TokenComparators;
import jsyntaxpane.TokenType;

/**
 * A "dynamic" {@link Lexer} that uses regular-expression patterns to tokenize
 * any document.  It is NOT as fast as the JFlex generated lexers.
 *
 * The current implementation is about 20x slower than a JFlex lexer
 * (5000 lines in 100ms, vs 5ms for a JFlex lexer).
 *
 * This is still usable for a few hundred lines; 500 lines parse in about 10ms.
 *
 * The actual speed also depends on how complex the regular expressions are
 * and how many of them actually produce a match.
 *
 * Since the KEYWORD TokenType is by order less than IDENTIFIER, the higher
 * precedence of the KEYWORD token will be used, even if the same regex matches
 * an IDENTIFIER.  This is a neat side-effect of the ordering of the TokenTypes.
 * We then just need to add any non-overlapping matches.  And since longer
 * matches are found first, we will properly match the longer identifiers which
 * start with a keyword.
 *
 * This behaviour can be modified by overriding the {@code compareTo} method.
 *
 * At most one pattern is kept per {@link TokenType}; registering a second
 * pattern for the same type replaces the first.
 *
 * @author Ayman Al-Sairafi
 */
public class SimpleRegexLexer implements Lexer {

    /** The compiled pattern registered for each token type. */
    Map<TokenType, Pattern> patterns = new HashMap<TokenType, Pattern>();

    /**
     * Creates a lexer from a map of token type names to regular expressions.
     *
     * @param props map whose keys name a {@link TokenType} constant and whose
     *              values are the regular expressions for that type; both are
     *              converted with {@code toString()}
     * @see #putPatterns(Map)
     */
    public SimpleRegexLexer(Map<?, ?> props) {
        putPatterns(props);
    }

    /**
     * Creates a lexer from a properties file of token type names and
     * regular expressions.
     *
     * @param propsLocation path of the properties file to read
     * @throws IOException if the file cannot be opened or read
     */
    public SimpleRegexLexer(String propsLocation) throws IOException {
        Properties props = new Properties();
        FileReader reader = new FileReader(propsLocation);
        try {
            props.load(reader);
        } finally {
            // Always release the file handle, even when loading fails.
            reader.close();
        }
        putPatterns(props);
    }

    /**
     * Runs every registered pattern over the segment and appends the selected
     * matches to {@code tokens}.
     *
     * @param segment text to scan
     * @param ofst    offset added to each match position to obtain the token start
     * @param tokens  list that receives the selected tokens
     */
    @Override
    public void parse(Segment segment, int ofst, List<Token> tokens) {
        // Collect every match of every pattern, ordered by LONGEST_FIRST.
        TreeSet<Token> allMatches = new TreeSet<Token>(TokenComparators.LONGEST_FIRST);
        for (Map.Entry<TokenType, Pattern> e : patterns.entrySet()) {
            Matcher m = e.getValue().matcher(segment);
            while (m.find()) {
                // Match positions are relative to the segment; shift by ofst.
                Token t = new Token(e.getKey(), m.start() + ofst, m.end() - m.start());
                allMatches.add(t);
            }
        }
        // Keep only tokens that start after the end of the last accepted token.
        // NOTE(review): if Token.end() is exclusive (start + length), a token
        // that begins exactly where the previous one ended is skipped by this
        // strict comparison -- confirm against Token before changing.
        int end = -1;
        for (Token t : allMatches) {
            if (t.start > end) {
                tokens.add(t);
                end = t.end();
            }
        }
    }

    /**
     * Registers (or replaces) the pattern used for the given token type.
     *
     * @param type  token type produced by matches of the pattern
     * @param regex regular expression to compile
     * @return this lexer, to allow call chaining
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public SimpleRegexLexer putPattern(TokenType type, String regex) {
        patterns.put(type, Pattern.compile(regex));
        return this;
    }

    /**
     * Registers a pattern for every entry of the given map.
     *
     * @param props map whose keys name a {@link TokenType} constant and whose
     *              values are the regular expressions for that type; both are
     *              converted with {@code toString()}
     * @return this lexer, to allow call chaining
     * @throws java.util.regex.PatternSyntaxException if a value is not a valid
     *         regular expression
     */
    public SimpleRegexLexer putPatterns(Map<?, ?> props) {
        for (Map.Entry<?, ?> entry : props.entrySet()) {
            TokenType t = TokenType.valueOf(entry.getKey().toString());
            patterns.put(t, Pattern.compile(entry.getValue().toString()));
        }
        return this;
    }
}