SzTEAnnotationPipeline.java example

Explorer
kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged
package edu.stanford.nlp.pipeline;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.MutableLong;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class is designed to apply multiple Annotators to an Annotation. The idea is that you first build up the pipeline by adding Annotators, and
 * then you take the objects you wish to annotate and pass them in and get back in return a fully annotated object. Please see the package level
 * javadoc for sample usage and a more complete description.
 * 
 * @author Jenny Finkel
 */

public class SzTEAnnotationPipeline implements Annotator {

  /** Compile-time switch: when {@code true}, the time spent in each annotator is accumulated. */
  protected static final boolean TIME = true;

  /** The annotators of this pipeline, in the order in which they are applied. */
  private final List<Annotator> annotators;

  /**
   * One accumulator per annotator; the i-th entry belongs to the i-th annotator. Only allocated when {@link #TIME} is {@code true}.
   */
  private List<MutableLong> accumulatedTime;

  /**
   * Create a pipeline over the given annotators. The list is stored as-is (it is not copied), and {@link #addAnnotator(Annotator)} appends to it.
   * 
   * @param annotators
   *          The annotators to apply, in order
   */
  public SzTEAnnotationPipeline(List<Annotator> annotators) {
    this.annotators = annotators;
    if (TIME) {
      int num = annotators.size();
      accumulatedTime = new ArrayList<MutableLong>(num);
      for (int i = 0; i < num; i++) {
        accumulatedTime.add(new MutableLong());
      }
    }
  }

  /**
   * Create an empty pipeline; annotators are added with {@link #addAnnotator(Annotator)}.
   */
  public SzTEAnnotationPipeline() {
    this(new ArrayList<Annotator>());
  }

  /**
   * Append an annotator to the end of the pipeline (together with its timing accumulator when {@link #TIME} is enabled).
   * 
   * @param annotator
   *          The annotator to append
   */
  public void addAnnotator(Annotator annotator) {
    annotators.add(annotator);
    if (TIME) {
      accumulatedTime.add(new MutableLong());
    }
  }

  /**
   * Run a subset of the pipeline on an input annotation. The annotators are visited in pipeline order; an annotator is run when
   * {@code subAnnotators} is {@code null} or contains the annotator's simple class name with every occurrence of {@code "Annotator"} removed
   * (e.g. {@code "StopWord"} for {@code StopWordAnnotator}).
   * 
   * @param annotation
   *          The input annotation, passed to each selected annotator in turn
   * @param subAnnotators
   *          The short names of the annotators to run, or {@code null} to run all of them
   */
  public void annotate(Annotation annotation, Set<String> subAnnotators) {
    // The accumulator list only exists when timing is enabled.
    Iterator<MutableLong> timeSlots = TIME ? accumulatedTime.iterator() : null;
    Timing timer = new Timing();
    for (Annotator annotator : annotators) {
      // Advance the accumulator iterator for EVERY annotator, including skipped ones, so that
      // the elapsed time is always credited to the annotator that actually ran.
      MutableLong slot = TIME ? timeSlots.next() : null;
      if (subAnnotators != null && !subAnnotators.contains(annotator.getClass().getSimpleName().replace("Annotator", ""))) {
        continue;
      }
      if (TIME) {
        timer.start();
      }
      annotator.annotate(annotation);
      if (TIME) {
        // Keep the elapsed time as a long; no narrowing before it is added to the accumulator.
        long elapsed = timer.stop();
        slot.incValue(elapsed);
      }
    }
  }

  // ->EXTENSION

  /**
   * Run the pipeline on an input annotation. The annotation is modified in place
   * 
   * @param annotation
   *          The input annotation, usually a raw document
   */
  public void annotate(Annotation annotation) {
    annotate(annotation, null);
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of all available cores.
   * 
   * @param annotations
   *          The input annotations to process
   */
  public void annotate(Iterable<Annotation> annotations) {
    annotate(annotations, Runtime.getRuntime().availableProcessors());
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of all available cores.
   * 
   * @param annotations
   *          The input annotations to process
   * @param callback
   *          A function to be called when an annotation finishes. The return value of the callback is ignored.
   */
  public void annotate(final Iterable<Annotation> annotations, final Function<Annotation, Object> callback) {
    annotate(annotations, Runtime.getRuntime().availableProcessors(), callback);
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of threads given in numThreads.
   * 
   * @param annotations
   *          The input annotations to process
   * @param numThreads
   *          The number of threads to run on
   */
  public void annotate(final Iterable<Annotation> annotations, int numThreads) {
    // No-op callback: nothing to do when an annotation finishes.
    annotate(annotations, numThreads, new Function<Annotation, Object>() {
      @Override
      public Object apply(Annotation in) {
        return null;
      }
    });
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of threads given in numThreads. When {@code numThreads} is 1 the
   * annotations are processed sequentially on the calling thread and no threads are spawned.
   * <p>
   * NOTE(review): the per-annotator timing accumulators are updated from the worker tasks without any synchronization in this class; confirm
   * whether that matters for the reported timings.
   * 
   * @param annotations
   *          The input annotations to process
   * @param numThreads
   *          The number of threads to run on
   * @param callback
   *          A function to be called when an annotation finishes. The return value of the callback is ignored.
   */
  public void annotate(final Iterable<Annotation> annotations, int numThreads, final Function<Annotation, Object> callback) {
    // case: single thread (no point in spawning threads)
    if (numThreads == 1) {
      for (Annotation ann : annotations) {
        annotate(ann);
        callback.apply(ann);
      }
      // Done: without this return every annotation would be processed (and called back) a second time below.
      return;
    }
    // Java's equivalent to ".map{ lambda(annotation) => annotate(annotation) }
    // Lazily wraps each input annotation in a Runnable that annotates it and then invokes the callback.
    Iterable<Runnable> threads = new Iterable<Runnable>() {
      @Override
      public Iterator<Runnable> iterator() {
        final Iterator<Annotation> iter = annotations.iterator();
        return new Iterator<Runnable>() {
          @Override
          public boolean hasNext() {
            return iter.hasNext();
          }

          @Override
          public Runnable next() {
            if (!iter.hasNext()) {
              throw new NoSuchElementException();
            }
            final Annotation input = iter.next();
            return new Runnable() {
              @Override
              public void run() {
                // (logging) label the track with at most the first 50 characters of the document
                String text = input.toString();
                String beginningOfDocument = text.substring(0, Math.min(50, text.length()));
                Redwood.startTrack("Annotating \"" + beginningOfDocument + "...\"");
                // (annotate)
                annotate(input);
                // (callback)
                callback.apply(input);
                // (logging again)
                Redwood.endTrack("Annotating \"" + beginningOfDocument + "...\"");
              }
            };
          }

          @Override
          public void remove() {
            iter.remove();
          }
        };
      }
    };
    // Thread
    Redwood.Util.threadAndRun(this.getClass().getSimpleName(), threads, numThreads);
  }

  /**
   * Return the total pipeline annotation time in milliseconds.
   * 
   * @return The total pipeline annotation time in milliseconds
   */
  protected long getTotalTime() {
    long total = 0;
    for (MutableLong m : accumulatedTime) {
      total += m.longValue();
    }
    return total;
  }

  /**
   * Return a String that gives detailed human-readable information about how much time was spent by each annotator and by the entire annotation
   * pipeline. This String includes newline characters but does not end with one, and so it is suitable to be printed out with a {@code println()}.
   * When {@link #TIME} is disabled the returned String is empty.
   * 
   * @return Human readable information on time spent in processing.
   */
  public String timingInformation() {
    StringBuilder sb = new StringBuilder();
    if (TIME) {
      sb.append("Annotation pipeline timing information:\n");
      Iterator<MutableLong> it = accumulatedTime.iterator();
      long total = 0;
      for (Annotator annotator : annotators) {
        MutableLong m = it.next();
        sb.append(StringUtils.getShortClassName(annotator)).append(": ");
        sb.append(Timing.toSecondsString(m.longValue())).append(" sec.\n");
        total += m.longValue();
      }
      sb.append("TOTAL: ").append(Timing.toSecondsString(total)).append(" sec.");
    }
    return sb.toString();
  }

  /**
   * Collect the requirements satisfied by this pipeline: the union of what each of its annotators reports as satisfied.
   * 
   * @return A new set containing every requirement satisfied by at least one annotator
   */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    Set<Requirement> satisfied = Generics.newHashSet();
    for (Annotator annotator : annotators) {
      satisfied.addAll(annotator.requirementsSatisfied());
    }
    return satisfied;
  }

  /**
   * Report the requirements of this pipeline, taken from its first annotator only.
   * 
   * @return The requirements of the first annotator, or an empty set when the pipeline has no annotators
   */
  @Override
  public Set<Requirement> requires() {
    if (annotators.isEmpty()) {
      return Collections.emptySet();
    }
    return annotators.get(0).requires();
  }

}