/**
 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
 */

package net.sourceforge.pmd.cpd;

import java.util.List;
import java.util.Locale;

/**
 * Base class for simple, line-oriented tokenizers.
 *
 * <p>Each source line is split into tokens at whitespace and at the characters
 * listed in {@link #ignorableCharacter}. A string literal (opened by one of the
 * characters in {@link #stringToken}) and a one-line comment (opened by
 * {@link #oneLineCommentChar}) are each emitted as a single token. Tokens whose
 * image is listed in {@link #ignorableStmt} are dropped.</p>
 *
 * <p>The instance keeps the source lines, the current line and the current line
 * index in fields while {@link #tokenize(SourceCode, Tokens)} runs.</p>
 *
 * @author Zev Blut zb@ubit.com
 * @author Romain PELISSE belaran@gmail.com
 */
public abstract class AbstractTokenizer implements Tokenizer {

    // FIXME depending on subclasses to assign local vars is rather fragile -
    // better to make private and setup via explicit hook methods

    /**
     * String delimiters, each given as a one-character string. Should be set
     * by subclasses; {@code null} is treated as "no string delimiters".
     */
    protected List<String> stringToken;

    /**
     * Characters, each given as a one-character string, that separate tokens
     * the same way whitespace does and are never part of a token. Should be
     * set by subclasses; {@code null} is treated as empty.
     */
    // FIXME: Maybe an array of 'char' would be better for performance ?
    protected List<String> ignorableCharacter;

    /**
     * Token images that must not be reported. The comparison is made against
     * the token as written in the source, before any lower-casing. Should be
     * set by subclasses; {@code null} is treated as empty.
     */
    protected List<String> ignorableStmt;

    /**
     * Character that starts a comment running to the end of the line. Most
     * script languages (shell, ruby, python, ...) use '#'.
     */
    protected char oneLineCommentChar = '#';

    /** Lines of the source currently being tokenized. */
    private List<String> code;

    /** Zero-based index into {@link #code} of the line being processed. */
    private int lineNumber = 0;

    /** The line at {@link #lineNumber}. */
    private String currentLine;

    /**
     * Whether a string that is not closed at the end of a line continues on
     * the next line. Most languages allow it, so the default is {@code true}.
     */
    protected boolean spanMultipleLinesString = true;

    /**
     * Optional line-continuation character (e.g. a backslash). When set and a
     * string continues on the next line, this character is removed from the end
     * of the token if it is the last character collected so far.
     */
    protected Character spanMultipleLinesLineContinuationCharacter = null;

    /** Whether token images are lower-cased before being recorded. */
    private boolean downcaseString = true;

    /**
     * Tokenizes the given source and appends the resulting entries, followed by
     * the EOF entry, to {@code tokenEntries}.
     *
     * <p>Each entry is recorded with the zero-based index of the line that was
     * current when the token was completed; for a string spanning several
     * lines this is the line on which scanning of the string stopped.</p>
     *
     * @param tokens       the source to read lines and the file name from
     * @param tokenEntries the collection receiving the token entries
     */
    @Override
    public void tokenize(SourceCode tokens, Tokens tokenEntries) {
        code = tokens.getCode();

        for (lineNumber = 0; lineNumber < code.size(); lineNumber++) {
            currentLine = code.get(lineNumber);
            int loc = 0;
            // currentLine and lineNumber may be advanced by parseString(), so
            // the field is re-read on every iteration.
            while (loc < currentLine.length()) {
                StringBuilder token = new StringBuilder();
                loc = getTokenFromLine(token, loc);

                if (token.length() == 0) {
                    continue;
                }
                String image = token.toString();
                if (isIgnorableString(image)) {
                    continue;
                }
                if (downcaseString) {
                    // Locale.ROOT keeps the token images independent of the
                    // default locale of the machine running the analysis.
                    image = image.toLowerCase(Locale.ROOT);
                }
                tokenEntries.add(new TokenEntry(image, tokens.getFileName(), lineNumber));
            }
        }

        tokenEntries.add(TokenEntry.getEOF());
    }

    /**
     * Collects the next token of the current line into {@code token}.
     *
     * @param token receives the characters of the token; may stay empty when
     *              only separators remain on the line
     * @param loc   index in the current line at which to start scanning
     * @return the index at which the next scan must start
     */
    private int getTokenFromLine(StringBuilder token, int loc) {
        final int length = currentLine.length();

        for (int j = loc; j < length; j++) {
            char tok = currentLine.charAt(j);

            if (Character.isWhitespace(tok) || ignoreCharacter(tok)) {
                // A separator ends a token in progress; leading separators
                // are simply skipped.
                if (token.length() > 0) {
                    return j;
                }
                continue;
            }

            if (isComment(tok)) {
                if (token.length() > 0) {
                    // Finish the pending token; the comment is picked up by
                    // the next call.
                    return j;
                }
                // Start the comment at its own first character so that
                // preceding separators never leak into the comment token.
                return getCommentToken(token, j);
            }

            if (isString(tok)) {
                if (token.length() > 0) {
                    // The string must be parsed as a separate token.
                    return j;
                }
                return parseString(token, j, tok);
            }

            token.append(tok);
        }

        // The rest of the line has been consumed.
        return Math.max(loc + 1, length);
    }

    /**
     * Collects a string literal, delimiters included, into {@code token}.
     *
     * <p>A character preceded by an unescaped backslash is taken literally, so
     * neither an escaped delimiter nor the character following an escaped
     * backslash is misread. If the string is still open at the end of the line,
     * {@link #spanMultipleLinesString} is set and more lines exist, scanning
     * continues on the next line, advancing {@link #lineNumber} and
     * {@link #currentLine}. Escape state does not carry over to the next
     * line.</p>
     *
     * @param token           receives the characters of the string
     * @param loc             index of the opening delimiter in the current line
     * @param stringDelimiter the character that opened, and will close, the string
     * @return the index, in the line that is current on return, of the first
     *         character after the string
     */
    private int parseString(StringBuilder token, int loc, char stringDelimiter) {
        int pos = loc;

        // Iterative rather than recursive: an unterminated string may run over
        // every remaining line of the file.
        while (true) {
            boolean escaped = false;
            boolean done = false;

            while (pos < currentLine.length() && !done) {
                char tok = currentLine.charAt(pos);
                if (escaped) {
                    // Whatever follows an escaping backslash is literal.
                    escaped = false;
                } else if (tok == '\\') {
                    escaped = true;
                } else if (tok == stringDelimiter && token.length() > 0) {
                    // A non-empty token means this is the closing delimiter,
                    // not the opening one.
                    done = true;
                }
                token.append(tok);
                pos++;
            }

            boolean continuesOnNextLine = !done
                    && spanMultipleLinesString
                    && lineNumber < code.size() - 1;
            if (!continuesOnNextLine) {
                return pos;
            }

            // Remove the last character if it is the line continuation
            // (e.g. backslash) character.
            if (spanMultipleLinesLineContinuationCharacter != null
                    && token.length() > 0
                    && token.charAt(token.length() - 1)
                            == spanMultipleLinesLineContinuationCharacter.charValue()) {
                token.deleteCharAt(token.length() - 1);
            }

            currentLine = code.get(++lineNumber);
            pos = 0;
        }
    }

    /**
     * Tells whether the character is one of the configured ignorable characters.
     *
     * @param tok the character to test
     * @return {@code true} if {@link #ignorableCharacter} contains it
     */
    private boolean ignoreCharacter(char tok) {
        return ignorableCharacter != null && ignorableCharacter.contains(String.valueOf(tok));
    }

    /**
     * Tells whether the character is one of the configured string delimiters.
     *
     * @param tok the character to test
     * @return {@code true} if {@link #stringToken} contains it
     */
    private boolean isString(char tok) {
        return stringToken != null && stringToken.contains(String.valueOf(tok));
    }

    /**
     * Tells whether the character starts a one-line comment.
     *
     * @param tok the character to test
     * @return {@code true} if it equals {@link #oneLineCommentChar}
     */
    private boolean isComment(char tok) {
        return tok == oneLineCommentChar;
    }

    /**
     * Appends the rest of the current line, starting at {@code loc}, to
     * {@code token}.
     *
     * @param token receives the comment text
     * @param loc   index of the first character to copy
     * @return the index just past the last character copied, or {@code loc} if
     *         it already lies at or beyond the end of the line
     */
    private int getCommentToken(StringBuilder token, int loc) {
        final int end = currentLine.length();
        if (loc < end) {
            token.append(currentLine, loc, end);
        }
        return Math.max(loc, end);
    }

    /**
     * Tells whether a token image is configured to be dropped.
     *
     * @param token the token image as written in the source
     * @return {@code true} if {@link #ignorableStmt} contains it
     */
    private boolean isIgnorableString(String token) {
        return ignorableStmt != null && ignorableStmt.contains(token);
    }
}