// AbstractTokenizer.java example
//
// Explorer: pmd-master
/**
 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
 */

package net.sourceforge.pmd.cpd;

import java.util.List;
import java.util.Locale;

/**
 * Line-oriented base tokenizer. Each line of the source is split into tokens
 * at whitespace and at the configured ignorable characters; string literals
 * and one-line comments are each emitted as a single token.
 *
 * <p>Subclasses configure the behaviour by assigning the protected fields
 * below. A list field left {@code null} is treated as an empty list.
 *
 * <p>Instances keep the state of the current tokenization run in fields, so
 * a single instance must not run {@link #tokenize(SourceCode, Tokens)}
 * concurrently.
 *
 * @author Zev Blut zb@ubit.com
 * @author Romain PELISSE belaran@gmail.com
 */
public abstract class AbstractTokenizer implements Tokenizer {

    // FIXME depending on subclasses to assign local vars is rather fragile -
    // better to make private and setup via explicit hook methods

    /**
     * Single-character strings that delimit a string literal. Should be set
     * by subclasses.
     */
    protected List<String> stringToken;

    /**
     * Single-character strings that separate tokens and are themselves
     * dropped. Should be set by subclasses.
     */
    // FIXME: Maybe an array of 'char' would be better for performance ?
    protected List<String> ignorableCharacter;

    /**
     * Token images that are discarded instead of being emitted. The
     * comparison happens before the token is lower-cased. Should be set by
     * subclasses.
     */
    protected List<String> ignorableStmt;

    /**
     * Character that starts a one-line comment. Most script languages
     * (shell, ruby, python, ...) use '#'.
     */
    protected char oneLineCommentChar = '#';

    /** Lines of the source currently being tokenized. */
    private List<String> code;

    /** Index into {@link #code} of the line currently being processed. */
    private int lineNumber = 0;

    /** The line at {@link #lineNumber}. */
    private String currentLine;

    /**
     * Whether a string literal may continue on the following line(s). Most
     * languages allow it, so the default is {@code true}.
     */
    protected boolean spanMultipleLinesString = true;

    /**
     * Optional line-continuation character (e.g. a backslash). When set and
     * an unfinished string literal ends a line with this character, the
     * character is removed before the next line is appended.
     */
    protected Character spanMultipleLinesLineContinuationCharacter = null;

    /** Whether token images are lower-cased before being emitted. */
    private boolean downcaseString = true;

    /**
     * Tokenizes the given source and appends the tokens, followed by the
     * EOF entry, to {@code tokenEntries}.
     *
     * <p>Each entry is created with the index (into the source's line list)
     * of the line being processed when the token was completed, so a string
     * literal spanning several lines is reported at the line where it ends.
     *
     * @param tokens       the source code to tokenize
     * @param tokenEntries the collector receiving the tokens
     */
    @Override
    public void tokenize(SourceCode tokens, Tokens tokenEntries) {
        code = tokens.getCode();

        for (lineNumber = 0; lineNumber < code.size(); lineNumber++) {
            currentLine = code.get(lineNumber);
            int loc = 0;
            // getTokenFromLine may advance lineNumber/currentLine when a
            // string literal spans lines, hence the condition is re-read
            // from the fields on every iteration.
            while (loc < currentLine.length()) {
                StringBuilder token = new StringBuilder();
                loc = getTokenFromLine(token, loc);
                if (token.length() == 0) {
                    continue;
                }
                String image = token.toString();
                if (isIgnorableString(image)) {
                    continue;
                }
                if (downcaseString) {
                    // Locale.ROOT keeps the result independent of the
                    // JVM's default locale (e.g. Turkish dotless i).
                    image = image.toLowerCase(Locale.ROOT);
                }
                // TODO: re-think how to hook debug output (CPD.debugEnable)
                // NOTE(review): lineNumber is the zero-based list index -
                // confirm this is what TokenEntry expects.
                tokenEntries.add(new TokenEntry(image, tokens.getFileName(), lineNumber));
            }
        }
        tokenEntries.add(TokenEntry.getEOF());
    }

    /**
     * Reads the next token of the current line into {@code token}.
     *
     * @param token receives the characters of the token; may stay empty
     *              when only separators were left on the line
     * @param loc   index in the current line at which to start scanning
     * @return the index at which the next scan has to resume
     */
    private int getTokenFromLine(StringBuilder token, int loc) {
        for (int j = loc; j < currentLine.length(); j++) {
            char tok = currentLine.charAt(j);

            if (Character.isWhitespace(tok) || ignoreCharacter(tok)) {
                if (token.length() > 0) {
                    // separator terminates the token collected so far
                    return j;
                }
                continue; // leading separator, skip it
            }

            if (isComment(tok)) {
                if (token.length() > 0) {
                    return j; // the comment becomes the next token
                }
                // Start the comment at the comment character itself (j),
                // not at a previously skipped separator.
                return getCommentToken(token, j);
            }

            if (isString(tok)) {
                if (token.length() > 0) {
                    return j; // the string is parsed as a separate token
                }
                // we are at the start of a string
                return parseString(token, j, tok);
            }

            token.append(tok);
        }
        // Whole remainder of the line consumed: resume at the end of the
        // line (or just past 'loc' if the scan started beyond the end).
        return Math.max(loc + 1, currentLine.length());
    }

    /**
     * Reads a string literal, including both delimiters, into {@code token}.
     * If the literal is not closed at the end of the line, multi-line
     * strings are enabled and more lines exist, the following line(s) are
     * appended directly (no line separator is inserted) and
     * {@code lineNumber}/{@code currentLine} are advanced accordingly.
     *
     * @param token           receives the characters of the literal
     * @param loc             index of the opening delimiter in the current line
     * @param stringDelimiter the delimiter that opened the literal
     * @return the index, in the (possibly advanced) current line, of the
     *         first character after the literal
     */
    private int parseString(StringBuilder token, int loc, char stringDelimiter) {
        // Iterative on purpose: one stack frame per continuation line could
        // overflow the stack for an unterminated quote in a long file.
        while (true) {
            boolean escaped = false;
            boolean done = false;
            while (loc < currentLine.length() && !done) {
                char tok = currentLine.charAt(loc);
                if (escaped && tok == stringDelimiter) {
                    // escaped delimiter, part of the literal
                    escaped = false;
                } else if (tok == stringDelimiter && token.length() > 0) {
                    // closing delimiter (an empty token means this is the
                    // opening one)
                    done = true;
                } else if (tok == '\\') {
                    // A backslash escapes the next char, unless it is
                    // itself escaped ("\\\\" must not escape what follows).
                    escaped = !escaped;
                } else {
                    escaped = false;
                }
                token.append(tok);
                loc++;
            }

            // 'loc' is already one past the last consumed character.
            boolean moreLines = lineNumber < code.size() - 1;
            if (done || !spanMultipleLinesString || !moreLines) {
                return loc;
            }

            // The literal continues on the next line. Remove the last
            // character if it is the line continuation (e.g. backslash)
            // character.
            if (spanMultipleLinesLineContinuationCharacter != null && token.length() > 0
                    && token.charAt(token.length() - 1) == spanMultipleLinesLineContinuationCharacter.charValue()) {
                token.deleteCharAt(token.length() - 1);
            }
            currentLine = code.get(++lineNumber);
            loc = 0;
        }
    }

    /** Returns whether {@code tok} is a configured ignorable character. */
    private boolean ignoreCharacter(char tok) {
        return ignorableCharacter != null && ignorableCharacter.contains(String.valueOf(tok));
    }

    /** Returns whether {@code tok} is a configured string delimiter. */
    private boolean isString(char tok) {
        return stringToken != null && stringToken.contains(String.valueOf(tok));
    }

    /** Returns whether {@code tok} starts a one-line comment. */
    private boolean isComment(char tok) {
        return tok == oneLineCommentChar;
    }

    /**
     * Appends the rest of the current line, starting at {@code loc}, to
     * {@code token}.
     *
     * @return the index after the last character appended
     */
    private int getCommentToken(StringBuilder token, int loc) {
        while (loc < currentLine.length()) {
            token.append(currentLine.charAt(loc++));
        }
        return loc;
    }

    /** Returns whether {@code token} is a configured ignorable statement. */
    private boolean isIgnorableString(String token) {
        return ignorableStmt != null && ignorableStmt.contains(token);
    }
}