package edu.stanford.nlp.pipeline;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.util.Timing;
/**
* This is an abstract base class for any annotator class that uses a
* Tokenizer to split TextAnnotation into TokensAnnotation.
* <br>
* The only method the subclass needs to define is a method
* that produces a Tokenizer of CoreLabels, which is then
* used to split the TextAnnotation of the given Annotation
* into CoreLabels.
* <br>
* In order to maintain thread safety, getTokenizer should return
* a thread-safe tokenizer. In the case of tokenizers built from
* .flex files, that will mean new tokenizers for each call.
*
* @author Jenny Finkel
* @author John Bauer
*/
public abstract class TokenizerAnnotator implements Annotator {

  /** When true, {@link #annotate(Annotation)} reports progress, timing and the resulting tokens on stderr. */
  private final boolean verbose;

  /**
   * Creates the annotator.
   *
   * @param verbose whether {@link #annotate(Annotation)} should write progress
   *                and the produced tokens to {@code System.err}
   */
  public TokenizerAnnotator(boolean verbose) {
    this.verbose = verbose;
  }

  /**
   * Supplies the tokenizer that {@link #annotate(Annotation)} uses to split
   * the characters available from the given reader into CoreLabels.
   *
   * @param r reader over the text to be tokenized
   * @return a tokenizer producing CoreLabels from {@code r}
   */
  public abstract Tokenizer<CoreLabel> getTokenizer(Reader r);

  /**
   * Reads the TextAnnotation of the given annotation, tokenizes it with the
   * tokenizer returned by {@link #getTokenizer(Reader)}, and stores the
   * resulting list of CoreLabels under TokensAnnotation.
   *
   * @param annotation the annotation to tokenize; it must carry a TextAnnotation
   * @throws RuntimeException if the annotation has no TextAnnotation
   */
  @Override
  public void annotate(Annotation annotation) {
    // The timer only exists in verbose mode; a null timer doubles as the "quiet" flag below.
    final Timing timer = verbose ? new Timing() : null;
    if (timer != null) {
      timer.start();
      System.err.print("Tokenizing ... ");
    }

    if (!annotation.has(CoreAnnotations.TextAnnotation.class)) {
      throw new RuntimeException("unable to find text in annotation: " + annotation);
    }

    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    // A plain StringReader is enough: buffering adds nothing for an in-memory
    // String unless readLine() is needed.
    Tokenizer<CoreLabel> tokenizer = getTokenizer(new StringReader(text));
    List<CoreLabel> tokens = tokenizer.tokenize();
    // Note (cdm 2010-05-15): no per-token TextAnnotation fix-up is done here;
    // that is handled in CoreLabelTokenFactory.
    annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);

    if (timer != null) {
      timer.stop("done.");
      System.err.println("output: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
      System.err.println();
    }
  }

  /**
   * Returns the requirements of this annotator, which is always the empty set.
   *
   * @return an empty set
   */
  @Override
  public Set<Requirement> requires() {
    return Collections.emptySet();
  }

  /**
   * Returns the requirements this annotator satisfies.
   *
   * @return a singleton set containing {@code TOKENIZE_REQUIREMENT}
   */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(TOKENIZE_REQUIREMENT);
  }

}