package edu.stanford.nlp.pipeline;
import java.io.Reader;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
/**
* This class will PTB tokenize the input. It assumes that the original
* String is under the CoreAnnotations.TextAnnotation field
* and it will add the output from the
* InvertiblePTBTokenizer ({@code List<CoreLabel>}) under
* CoreAnnotation.TokensAnnotation.
*
* @author Jenny Finkel
* @author Christopher Manning
*/
public class PTBTokenizerAnnotator extends TokenizerAnnotator {
private final TokenizerFactory<CoreLabel> factory;
public static final String DEFAULT_OPTIONS = "invertible,ptb3Escaping=true";
public PTBTokenizerAnnotator() {
this(true);
}
public PTBTokenizerAnnotator(boolean verbose) {
this(verbose, DEFAULT_OPTIONS);
}
public PTBTokenizerAnnotator(boolean verbose, String options) {
super(verbose);
factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
}
@Override
public Tokenizer<CoreLabel> getTokenizer(Reader r) {
return factory.getTokenizer(r);
}
}