/** * BSD-style license; for more info see http://pmd.sourceforge.net/license.html * @authors: Zev Blut zb@ubit.com */ package org.rubypeople.rdt.internal.core.pmd; import java.util.List; public class RubyTokenizer implements Tokenizer { private boolean downcaseString = true; public void tokenize(SourceCode tokens, Tokens tokenEntries) { List code = tokens.getCode(); int curLineOffset = 0; for (int i = 0; i < code.size(); i++) { String currentLine = (String) code.get(i); int loc = 0; int startOffset = 0; while (loc < currentLine.length()) { StringBuffer token = new StringBuffer(); startOffset = curLineOffset + loc; loc = getTokenFromLine(currentLine, token, loc); if (token.length() > 0 && !isIgnorableString(token.toString())) { if (downcaseString) { token = new StringBuffer(token.toString().toLowerCase()); } tokenEntries.add(new TokenEntry(token.toString(), tokens.getFileName(), i + 1, startOffset, startOffset + token.length())); } } curLineOffset += currentLine.length(); } tokenEntries.add(TokenEntry.getEOF()); } private int getTokenFromLine(String line, StringBuffer token, int loc) { for (int j = loc; j < line.length(); j++) { char tok = line.charAt(j); if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) { if (isComment(tok)) { if (token.length() > 0) { return j; } else { return getCommentToken(line, token, loc); } } else if (isString(tok)) { if (token.length() > 0) { //if (loc == lin return j; // we need to now parse the string as a seperate token. } else { // we are at the start of a string return parseString(line, token, j, tok); } } else { token.append(tok); } } else { if (token.length() > 0) { return j; } } loc = j; } return loc + 1; } private int parseString(String line, StringBuffer token, int loc, char stringType) { boolean escaped = false; boolean done = false; //System.out.println("Parsing String:" + stringType); //System.out.println("Starting loc:" + loc); // problem of strings that span multiple lines :-( char tok = ' '; // this will be replaced. while ((loc < line.length()) && !done) { tok = line.charAt(loc); if (escaped && tok == stringType) { // System.out.println("Found an escaped string"); escaped = false; } else if (tok == stringType && (token.length() > 0)) { // we are done // System.out.println("Found an end string"); done = true; } else if (tok == '\\') { // System.out.println("Found an escaped char"); escaped = true; } else { // System.out.println("Adding char:" + tok + ";loc:" + loc); escaped = false; } //System.out.println("Adding char to String:" + token.toString()); token.append(tok); loc++; } return loc + 1; } private boolean ignoreCharacter(char tok) { boolean result = false; switch (tok) { case '{': case '}': case '(': case ')': case ';': case ',': result = true; break; default : result = false; } return result; } private boolean isString(char tok) { boolean result = false; switch (tok) { case '\'': case '"': result = true; break; default: result = false; } return result; } private boolean isComment(char tok) { return tok == '#'; } private int getCommentToken(String line, StringBuffer token, int loc) { while (loc < line.length()) { token.append(line.charAt(loc)); loc++; } return loc; } private boolean isIgnorableString(String token) { return "do".equals(token) || "end".equals(token); } }