/*
*
* Copyright (c) 2004-2011 by the TeXlapse Team.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*/
package net.sourceforge.texlipse.texparser;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.StringReader;
import net.sourceforge.texlipse.texparser.lexer.LexerException;
import net.sourceforge.texlipse.texparser.node.EOF;
import net.sourceforge.texlipse.texparser.node.TArgument;
import net.sourceforge.texlipse.texparser.node.TCchapter;
import net.sourceforge.texlipse.texparser.node.TCcite;
import net.sourceforge.texlipse.texparser.node.TCommentline;
import net.sourceforge.texlipse.texparser.node.TCparagraph;
import net.sourceforge.texlipse.texparser.node.TCpart;
import net.sourceforge.texlipse.texparser.node.TCsection;
import net.sourceforge.texlipse.texparser.node.TCssection;
import net.sourceforge.texlipse.texparser.node.TCsssection;
import net.sourceforge.texlipse.texparser.node.TOptargument;
import net.sourceforge.texlipse.texparser.node.TStar;
import net.sourceforge.texlipse.texparser.node.TWhitespace;
import net.sourceforge.texlipse.texparser.node.TWord;
import net.sourceforge.texlipse.texparser.node.Token;
/**
 * A LaTeX word counting parser. Counts the likely printed words from
 * the given string, i.e. all normal words and the contents of sectioning
 * commands are counted. Cite-references are counted as one word.
 *
 * @author Oskar Ojala
 */
public class LatexWordCounter {

    /** The text whose words are counted; may be <code>null</code>. */
    private final String selection;

    /**
     * Creates new word counter with a string with words to count.
     *
     * @param selection The string to use for counting words
     */
    public LatexWordCounter(String selection) {
        this.selection = selection;
    }

    /**
     * Counts the number of (LaTeX) words in the string that this
     * object contains.
     * <p>
     * Word and cite tokens count as one word each, except for a word
     * token whose text is exactly <code>&amp;</code>. After a sectioning
     * command (part, chapter, section, subsection, subsubsection or
     * paragraph) the whitespace-separated words of its argument are
     * counted as well.
     *
     * @return The number of words, 0 if the string is <code>null</code>,
     *         or -1 on an error
     */
    public int countWords() {
        if (selection == null) {
            // No text means no words; avoids a NullPointerException from StringReader.
            return 0;
        }
        try {
            LatexLexer lexer = new LatexLexer(new PushbackReader(new StringReader(selection), 4096));
            int words = 0;
            // True between a sectioning command and the argument that belongs to it.
            boolean expectArg = false;
            for (Token t = lexer.next(); !(t instanceof EOF); t = lexer.next()) {
                if (expectArg) {
                    if (t instanceof TArgument) {
                        words += countArgumentWords(t.getText());
                        expectArg = false;
                    } else if (!mayPrecedeArgument(t)) {
                        // this is an error state, but we'll skip it
                        // (the offending token itself is not counted)
                        expectArg = false;
                    }
                } else if (t instanceof TWord || t instanceof TCcite) {
                    // A lone "&" is not counted as a word.
                    if (!"&".equals(t.getText())) {
                        words++;
                    }
                } else if (t instanceof TWhitespace) { // make the common case fast
                    continue;
                } else if (isSectioningCommand(t)) {
                    expectArg = true;
                }
            }
            return words;
        } catch (IOException e) {
            return -1;
        } catch (LexerException e) {
            return -1;
        }
    }

    /**
     * Counts the whitespace-separated words in the text of an argument.
     * The text is trimmed first, because <code>String.split</code> yields
     * an empty leading element for text that starts with whitespace and a
     * single empty element for empty text; both would inflate the count.
     *
     * @param text The argument text
     * @return The number of words in the text, 0 if it is blank
     */
    private static int countArgumentWords(String text) {
        String trimmed = text.trim();
        if (trimmed.length() == 0) {
            return 0;
        }
        return trimmed.split("\\s+").length;
    }

    /**
     * Tells whether the token may appear between a sectioning command
     * and its argument without ending the wait for the argument.
     *
     * @param t The token to check
     * @return true for optional arguments, whitespace, stars and comment lines
     */
    private static boolean mayPrecedeArgument(Token t) {
        return t instanceof TOptargument || t instanceof TWhitespace
                || t instanceof TStar || t instanceof TCommentline;
    }

    /**
     * Tells whether the token is one of the sectioning commands whose
     * argument is included in the word count.
     *
     * @param t The token to check
     * @return true if the token is a sectioning command
     */
    private static boolean isSectioningCommand(Token t) {
        return t instanceof TCpart || t instanceof TCchapter
                || t instanceof TCsection || t instanceof TCssection
                || t instanceof TCsssection || t instanceof TCparagraph;
    }
}