HunTokenizerAnnotator.java example

Explorer

kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged

package edu.stanford.nlp.pipeline;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.HunTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Timing;

/**
 * Annotator that tokenizes the raw text of an {@link Annotation} with a tokenizer obtained from
 * {@link HunTokenizer}.
 * <p>
 * {@link #annotate(Annotation)} reads the text stored under {@code CoreAnnotations.TextAnnotation}, tokenizes it, and
 * stores the resulting {@code List<CoreLabel>} under {@code CoreAnnotations.TokensAnnotation}.
 * {@link #doOneSentence(String)} tokenizes a single string and returns the tokens together with a copy of them, so
 * that later processing can modify one list while the other keeps the original words.
 * 
 * @author Jenny Finkel
 */
public class HunTokenizerAnnotator implements Annotator {

  /** Tokenizer options used when the caller does not supply any. */
  private static final String DEFAULT_OPTIONS = "invertible,ptb3Escaping=true";

  /** Factory producing one tokenizer per input reader; built once in the constructor from {@link #options}. */
  private final TokenizerFactory<CoreLabel> factory;

  /** Timer used only for the verbose progress output of {@link #annotate(Annotation)}. */
  private final Timing timer = new Timing();

  /** Whether progress and the produced tokens are written to {@code System.err}. */
  private final boolean verbose;

  /** Option string handed to {@code HunTokenizer.factory}. */
  private final String options;

  /** Creates a verbose annotator with the default tokenizer options. */
  public HunTokenizerAnnotator() {
    this(true);
  }

  /**
   * Creates an annotator with the default tokenizer options.
   * 
   * @param verbose whether to print progress and output to {@code System.err}
   */
  public HunTokenizerAnnotator(boolean verbose) {
    this(verbose, DEFAULT_OPTIONS);
  }

  /**
   * Creates a verbose annotator with the given tokenizer options.
   * 
   * @param options option string passed through to {@code HunTokenizer.factory}
   */
  public HunTokenizerAnnotator(String options) {
    this(true, options);
  }

  /**
   * Creates an annotator.
   * 
   * @param verbose whether to print progress and output to {@code System.err}
   * @param options option string passed through to {@code HunTokenizer.factory}
   */
  public HunTokenizerAnnotator(boolean verbose, String options) {
    this.verbose = verbose;
    this.options = options;
    // The factory is created exactly once, here; every other constructor delegates to this one.
    this.factory = HunTokenizer.factory(new CoreLabelTokenFactory(), this.options);
  }

  /**
   * Tokenizes the text stored under {@code CoreAnnotations.TextAnnotation} and stores the tokens under
   * {@code CoreAnnotations.TokensAnnotation}.
   * 
   * @param annotation the annotation to read the text from and to write the tokens to
   * @throws RuntimeException if the annotation has no text (key absent or value {@code null})
   */
  @Override
  public void annotate(Annotation annotation) {
    if (verbose) {
      timer.start();
      System.err.print("Hun tokenizing ... ");
    }

    // A present-but-null text is treated like a missing one, so the caller gets the descriptive
    // exception below instead of a bare NullPointerException from StringReader.
    String text = annotation.has(CoreAnnotations.TextAnnotation.class)
        ? annotation.get(CoreAnnotations.TextAnnotation.class)
        : null;
    if (text == null) {
      throw new RuntimeException("unable to find text in annotation: " + annotation);
    }

    // Don't wrap in a BufferedReader: it gives nothing for an in-memory String unless readLine() is needed.
    Reader r = new StringReader(text);
    List<CoreLabel> tokens = factory.getTokenizer(r).tokenize();
    annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);

    if (verbose) {
      timer.stop("done.");
      System.err.println("output: " + tokens + "\n");
    }
  }

  /**
   * Tokenizes a single string.
   * 
   * @param origText the text to tokenize
   * @return a pair whose first element is the token list produced by the tokenizer and whose second element is a new
   *         list holding a {@code CoreLabel} copy of each token, in the same order
   */
  public Pair<List<CoreLabel>, List<CoreLabel>> doOneSentence(String origText) {
    Reader r = new StringReader(origText);
    List<CoreLabel> words = factory.getTokenizer(r).tokenize();
    // Presized: the copy holds exactly one label per token.
    List<CoreLabel> wordsCopy = new ArrayList<CoreLabel>(words.size());
    for (CoreLabel w : words) {
      wordsCopy.add(new CoreLabel(w));
    }
    return new Pair<List<CoreLabel>, List<CoreLabel>>(words, wordsCopy);
  }

  /**
   * @return an empty, unmodifiable set: this annotator declares no requirements
   */
  @Override
  public Set<Requirement> requires() {
    return Collections.emptySet();
  }

  /**
   * @return a singleton set containing {@code TOKENIZE_REQUIREMENT}
   */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(TOKENIZE_REQUIREMENT);
  }

}