package edu.stanford.nlp.pipeline; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.HunTokenizer; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.util.ArraySet; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.Timing; /** * This class will PTB tokenize the input string. There's at present an old and a new version (both are tried). The old version looks for an * original String or List<String> is under the Annotation.OriginalStringPLAnnotation and it will add the output from the * InvertiblePTBTokenizer (List<CoreLabel>) under Annotation.WordsPLAnnotation and it will make a copy, as a List<CoreLabel> * which it will put under Annotation.OriginalWordsPLAnnotation. The new version assumes that the original String or List<String> is * under the Annotation.TextAnnotation field and it will add the output from the InvertiblePTBTokenizer (List<CoreLabel>) under * Annotation.TokensAnnotation. If the original input was a List, then it will make a List<List<CoreLabel>>, and otherwise it * will make a List<CoreLabel< for both of these entries. The reason why there are copies is so that future Annotators can muck around * with the copy under WORDS_KEY, but can still access the original words if necessary. * * @author Jenny Finkel */ public class HunTokenizerAnnotator implements Annotator { private TokenizerFactory<CoreLabel> factory = HunTokenizer.factory(false, true); private Timing timer = new Timing(); private boolean VERBOSE = true; private String options = "invertible,ptb3Escaping=true"; public HunTokenizerAnnotator() { this(true); } public HunTokenizerAnnotator(boolean verbose) { this(verbose, "invertible,ptb3Escaping=true"); } public HunTokenizerAnnotator(String options) { this(true, options); } public HunTokenizerAnnotator(boolean verbose, String options) { this.VERBOSE = verbose; this.options = options; factory = HunTokenizer.factory(new CoreLabelTokenFactory(), this.options); } public void annotate(Annotation annotation) { if (VERBOSE) { timer.start(); System.err.print("Hun tokenizing ... "); } if (annotation.has(CoreAnnotations.TextAnnotation.class)) { String text = annotation.get(CoreAnnotations.TextAnnotation.class); Reader r = new StringReader(text); // don't wrap in BufferedReader. It gives you nothing for in memory String // unless you need the readLine() method! List<CoreLabel> tokens = this.factory.getTokenizer(r).tokenize(); // cdm 2010-05-15: This is now unnecessary, as it is done in CoreLabelTokenFactory // for (CoreLabel token: tokens) { // token.set(CoreAnnotations.TextAnnotation.class, token.get(TextAnnotation.class)); // } annotation.set(CoreAnnotations.TokensAnnotation.class, tokens); if (VERBOSE) { timer.stop("done."); System.err.println("output: " + annotation.get(CoreAnnotations.TokensAnnotation.class) + "\n"); } } else { throw new RuntimeException("unable to find text in annotation: " + annotation); } } public Pair<List<CoreLabel>, List<CoreLabel>> doOneSentence(String origText) { Reader r = new StringReader(origText); List<CoreLabel> words = factory.getTokenizer(r).tokenize(); List<CoreLabel> wordsCopy = new ArrayList<CoreLabel>(); for (CoreLabel w : words) { CoreLabel fl = new CoreLabel(w); wordsCopy.add(fl); } return new Pair<List<CoreLabel>, List<CoreLabel>>(words, wordsCopy); } @Override public Set<Requirement> requires() { return Collections.unmodifiableSet(new ArraySet<Requirement>()); } @Override public Set<Requirement> requirementsSatisfied() { return Collections.singleton(TOKENIZE_REQUIREMENT); } }