/** * BSD-style license; for more info see http://pmd.sourceforge.net/license.html */ package net.sourceforge.pmd.cpd; import java.io.BufferedReader; import java.io.CharArrayReader; import java.util.NoSuchElementException; import java.util.StringTokenizer; import org.apache.commons.io.IOUtils; /** * This class does a best-guess try-anything tokenization. * * @author jheintz */ public class AnyTokenizer implements Tokenizer { public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:"; @Override public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { StringBuilder sb = sourceCode.getCodeBuffer(); BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray())); try { int lineNumber = 1; String line = reader.readLine(); while (line != null) { StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true); try { String token = tokenizer.nextToken(); while (token != null) { if (!" ".equals(token) && !"\t".equals(token)) { tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber)); } token = tokenizer.nextToken(); } } catch (NoSuchElementException ex) { // done with tokens } // advance iteration variables line = reader.readLine(); lineNumber++; } } catch (Exception ex) { ex.printStackTrace(); } finally { IOUtils.closeQuietly(reader); tokenEntries.add(TokenEntry.getEOF()); } } }