package edu.stanford.nlp.pipeline;
import java.io.Reader;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
/**
 * A {@link TokenizerAnnotator} that relies on a {@link WhitespaceTokenizer}
 * to turn TextAnnotations into TokensAnnotations.
 * <br>
 * Newlines are returned as tokens when either {@link #EOL_PROPERTY} or
 * the NEWLINE_SPLITTER_PROPERTY defined in StanfordCoreNLP is present
 * and set to true.  In practice, either setting means the newlines end
 * up being removed by the sentence splitter.
 *
 * @author John Bauer
 */
public class WhitespaceTokenizerAnnotator extends TokenizerAnnotator {

  /** Property name which, when set to true, makes newlines come back as tokens. */
  public static final String EOL_PROPERTY = "tokenize.keepeol";

  /** Source of the tokenizers handed out by {@link #getTokenizer(Reader)}. */
  private final TokenizerFactory<CoreLabel> factory;

  /**
   * Creates the annotator and configures its whitespace tokenizer factory
   * from the given properties.
   *
   * @param props properties consulted for {@link #EOL_PROPERTY} and
   *              StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY; a missing
   *              entry is treated as "false"
   */
  public WhitespaceTokenizerAnnotator(Properties props) {
    super(false);

    // Either flag alone is enough to keep end-of-line tokens.
    boolean keepNewlines = isEnabled(props, EOL_PROPERTY)
        || isEnabled(props, StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY);

    this.factory =
        new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(
            new CoreLabelTokenFactory(), keepNewlines);
  }

  /**
   * Reads a boolean-valued property, defaulting to false when it is absent.
   *
   * @param props the properties to look in
   * @param key   the name of the property
   * @return true only if the property value equals "true", ignoring case
   */
  private static boolean isEnabled(Properties props, String key) {
    return Boolean.parseBoolean(props.getProperty(key, "false"));
  }

  /**
   * Returns a tokenizer, produced by the configured factory, that reads
   * from the given reader.
   *
   * @param r the reader supplying the text to tokenize
   * @return a tokenizer over {@code r}
   */
  @Override
  public Tokenizer<CoreLabel> getTokenizer(Reader r) {
    Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(r);
    return tokenizer;
  }

}