/** * BSD-style license; for more info see http://pmd.sourceforge.net/license.html */ package net.sourceforge.pmd.cpd; import java.io.BufferedReader; import java.io.CharArrayReader; import java.io.Closeable; import java.io.IOException; import java.io.PushbackReader; import java.util.Properties; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.RandomStringUtils; /** * This class does a best-guess try-anything tokenization. * * @author jheintz */ public class CsTokenizer implements Tokenizer { private boolean ignoreUsings = false; public void setProperties(Properties properties) { if (properties.containsKey(IGNORE_USINGS)) { ignoreUsings = Boolean.parseBoolean(properties.getProperty(IGNORE_USINGS, "false")); } } @Override public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { Tokenizer tokenizer = new Tokenizer(sourceCode.getCodeBuffer().toString()); Token token = tokenizer.getNextToken(); while (!token.equals(Token.EOF)) { Token lookAhead = tokenizer.getNextToken(); // Ignore using directives // Only using directives should be ignored, because these are used // to import namespaces // // Using directive: 'using System.Math;' // Using statement: 'using (Font font1 = new Font(..)) { .. }' if (ignoreUsings && "using".equals(token.image) && !"(".equals(lookAhead.image)) { // We replace the 'using' token by a random token, because it // should not be part of // any duplication block. When we omit it from the token stream, // there is a change that // we get a duplication block that starts before the 'using' // directives and ends afterwards. String randomTokenText = RandomStringUtils.randomAlphanumeric(20); token = new Token(randomTokenText, token.lineNumber); // Skip all other tokens of the using directive to prevent a // partial matching while (!";".equals(lookAhead.image) && !lookAhead.equals(Token.EOF)) { lookAhead = tokenizer.getNextToken(); } } if (!";".equals(token.image)) { tokenEntries.add(new TokenEntry(token.image, sourceCode.getFileName(), token.lineNumber)); } token = lookAhead; } tokenEntries.add(TokenEntry.getEOF()); IOUtils.closeQuietly(tokenizer); } public void setIgnoreUsings(boolean ignoreUsings) { this.ignoreUsings = ignoreUsings; } private static class Tokenizer implements Closeable { private boolean endOfFile; private int line; private final PushbackReader reader; Tokenizer(String sourceCode) { endOfFile = false; line = 1; reader = new PushbackReader(new BufferedReader(new CharArrayReader(sourceCode.toCharArray()))); } public Token getNextToken() { if (endOfFile) { return Token.EOF; } try { int ic = reader.read(); char c; StringBuilder b; while (ic != -1) { c = (char) ic; switch (c) { // new line case '\n': line++; ic = reader.read(); break; // white space case ' ': case '\t': case '\r': ic = reader.read(); break; case ';': return new Token(";", line); // < << <= <<= > >> >= >>= case '<': case '>': ic = reader.read(); if (ic == '=') { return new Token(c + "=", line); } else if (ic == c) { ic = reader.read(); if (ic == '=') { return new Token(c + c + "=", line); } else { reader.unread(ic); return new Token(String.valueOf(c) + c, line); } } else { reader.unread(ic); return new Token(String.valueOf(c), line); } // = == & &= && | |= || + += ++ - -= -- case '=': case '&': case '|': case '+': case '-': ic = reader.read(); if (ic == '=' || ic == c) { return new Token(c + String.valueOf((char) ic), line); } else { reader.unread(ic); return new Token(String.valueOf(c), line); } // ! != * *= % %= ^ ^= ~ ~= case '!': case '*': case '%': case '^': case '~': ic = reader.read(); if (ic == '=') { return new Token(c + "=", line); } else { reader.unread(ic); return new Token(String.valueOf(c), line); } // strings & chars case '"': case '\'': int beginLine = line; b = new StringBuilder(); b.append(c); while ((ic = reader.read()) != c) { if (ic == -1) { break; } b.append((char) ic); if (ic == '\\') { int next = reader.read(); if (next != -1) { b.append((char) next); if (next == '\n') { line++; } } } else if (ic == '\n') { line++; } } if (ic != -1) { b.append((char) ic); } return new Token(b.toString(), beginLine); // / /= /*...*/ //... case '/': ic = reader.read(); c = (char) ic; switch (c) { case '*': // int beginLine = line; int state = 1; b = new StringBuilder(); b.append("/*"); while ((ic = reader.read()) != -1) { c = (char) ic; b.append(c); if (c == '\n') { line++; } if (state == 1) { if (c == '*') { state = 2; } } else { if (c == '/') { ic = reader.read(); break; } else if (c != '*') { state = 1; } } } // ignore the /* comment // tokenEntries.add(new TokenEntry(b.toString(), // sourceCode.getFileName(), beginLine)); break; case '/': b = new StringBuilder(); b.append("//"); while ((ic = reader.read()) != '\n') { if (ic == -1) { break; } b.append((char) ic); } // ignore the // comment // tokenEntries.add(new TokenEntry(b.toString(), // sourceCode.getFileName(), line)); break; case '=': return new Token("/=", line); default: reader.unread(ic); return new Token("/", line); } break; default: // [a-zA-Z_][a-zA-Z_0-9]* if (Character.isJavaIdentifierStart(c)) { b = new StringBuilder(); do { b.append(c); ic = reader.read(); c = (char) ic; } while (Character.isJavaIdentifierPart(c)); reader.unread(ic); return new Token(b.toString(), line); } else if (Character.isDigit(c) || c == '.') { // numbers b = new StringBuilder(); do { b.append(c); if (c == 'e' || c == 'E') { ic = reader.read(); c = (char) ic; if ("1234567890-".indexOf(c) == -1) { break; } b.append(c); } ic = reader.read(); c = (char) ic; } while ("1234567890.iIlLfFdDsSuUeExX".indexOf(c) != -1); reader.unread(ic); return new Token(b.toString(), line); } else { // anything else return new Token(String.valueOf(c), line); } } } } catch (IOException e) { e.printStackTrace(); } endOfFile = true; return Token.EOF; } @Override public void close() throws IOException { reader.close(); } } private static class Token { public static final Token EOF = new Token("EOF", -1); public final String image; public final int lineNumber; Token(String image, int lineNumber) { this.image = image; this.lineNumber = lineNumber; } } }