package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.util.*;
import java.util.function.Consumer;
/**
* This class is designed to apply multiple Annotators
* to an Annotation. The idea is that you first
* build up the pipeline by adding Annotators, and then
* you take the objects you wish to annotate and pass
* them in and get back in return a fully annotated object.
* Please see the package level javadoc for sample usage
* and a more complete description.
*
* @author Jenny Finkel
*/
public class AnnotationPipeline implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AnnotationPipeline.class);

  /** Whether the time spent in each annotator is accumulated while annotating. */
  protected static final boolean TIME = true;

  /** The annotators of this pipeline, in the order in which they are run. */
  private final List<Annotator> annotators;

  /**
   * Accumulated time per annotator, parallel to {@link #annotators}.
   * Always non-null; it is only populated when {@link #TIME} is true.
   */
  private final List<MutableLong> accumulatedTime;

  /**
   * Create a pipeline that runs the given annotators in list order.
   * The list is stored as-is (not copied) and is mutated by
   * {@link #addAnnotator(Annotator)}, so it must be modifiable if that
   * method is going to be used.
   *
   * @param annotators The annotators to run, in order
   */
  public AnnotationPipeline(List<Annotator> annotators) {
    this.annotators = annotators;
    int num = annotators.size();
    // Allocate unconditionally so that annotate() and getTotalTime() never
    // dereference null, even if TIME is switched off.
    this.accumulatedTime = new ArrayList<>(num);
    if (TIME) {
      for (int i = 0; i < num; i++) {
        accumulatedTime.add(new MutableLong());
      }
    }
  }

  /** Create an empty pipeline; annotators are added with {@link #addAnnotator(Annotator)}. */
  public AnnotationPipeline() {
    this(new ArrayList<>()); // It can't be a singletonList() since it isn't copied but is mutated.
  }

  /**
   * Append an annotator to the end of the pipeline.
   *
   * @param annotator The annotator to add
   */
  public void addAnnotator(Annotator annotator) {
    annotators.add(annotator);
    if (TIME) {
      // keep the timing list parallel to the annotator list
      accumulatedTime.add(new MutableLong());
    }
  }

  /**
   * Run the pipeline on an input annotation.
   * The annotation is modified in place.
   *
   * @param annotation The input annotation, usually a raw document
   * @throws RuntimeInterruptedException If the current thread was interrupted
   *         before one of the annotators was started
   */
  @Override
  public void annotate(Annotation annotation) {
    Iterator<MutableLong> it = accumulatedTime.iterator();
    Timing t = new Timing();
    for (Annotator annotator : annotators) {
      if (Thread.interrupted()) { // Allow interrupting
        throw new RuntimeInterruptedException();
      }
      if (TIME) {
        t.start();
      }
      annotator.annotate(annotation);
      if (TIME) {
        long elapsed = t.stop();
        MutableLong m = it.next();
        m.incValue(elapsed);
      }
    }
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of
   * all available cores.
   *
   * @param annotations The input annotations to process
   */
  public void annotate(Iterable<Annotation> annotations) {
    annotate(annotations, Runtime.getRuntime().availableProcessors());
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of
   * all available cores.
   *
   * @param annotations The input annotations to process
   * @param callback A function to be called with each annotation once it
   *                 has been annotated.
   */
  public void annotate(final Iterable<Annotation> annotations, final Consumer<Annotation> callback) {
    annotate(annotations, Runtime.getRuntime().availableProcessors(), callback);
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of
   * threads given in numThreads.
   *
   * @param annotations The input annotations to process
   * @param numThreads The number of threads to run on
   */
  public void annotate(final Iterable<Annotation> annotations, int numThreads) {
    annotate(annotations, numThreads, in -> {});
  }

  /**
   * Annotate a collection of input annotations IN PARALLEL, making use of
   * threads given in numThreads. When {@code numThreads} is exactly 1 the
   * annotations are processed sequentially on the calling thread.
   *
   * @param annotations The input annotations to process
   * @param numThreads The number of threads to run on
   * @param callback A function to be called with each annotation once it
   *                 has been annotated.
   */
  public void annotate(final Iterable<Annotation> annotations, int numThreads, final Consumer<Annotation> callback) {
    // case: single thread (no point in spawning threads)
    if (numThreads == 1) {
      for (Annotation ann : annotations) {
        annotate(ann);
        callback.accept(ann);
      }
      // Stop here. Without this return, control fell through to the code
      // below, which walks the annotations again, so every document was
      // annotated (and the callback invoked) a second time.
      return;
    }
    // Java's equivalent to ".map{ lambda(annotation) => annotate(annotation) }
    Iterable<Runnable> threads = () -> {
      final Iterator<Annotation> iter = annotations.iterator();
      return new Iterator<Runnable>() {
        @Override
        public boolean hasNext() {
          return iter.hasNext();
        }

        @Override
        public Runnable next() {
          if ( ! iter.hasNext()) {
            throw new NoSuchElementException();
          }
          final Annotation input = iter.next();
          return () -> {
            //(logging) use at most the first 50 characters of the document as a label
            String inputText = input.toString();
            String beginningOfDocument = inputText.substring(0, Math.min(50, inputText.length()));
            Redwood.startTrack("Annotating \"" + beginningOfDocument + "...\"");
            //(annotate)
            annotate(input);
            //(callback)
            callback.accept(input);
            //(logging again)
            Redwood.endTrack("Annotating \"" + beginningOfDocument + "...\"");
          };
        }

        @Override
        public void remove() {
          iter.remove();
        }
      };
    };
    // Thread
    Redwood.Util.threadAndRun(this.getClass().getSimpleName(), threads, numThreads);
  }

  /** Return the total pipeline annotation time in milliseconds.
   *
   *  @return The total pipeline annotation time in milliseconds
   */
  protected long getTotalTime() {
    long total = 0;
    for (MutableLong m : accumulatedTime) {
      total += m.longValue();
    }
    return total;
  }

  /** Return a String that gives detailed human-readable information about
   *  how much time was spent by each annotator and by the entire annotation
   *  pipeline.  This String includes newline characters but does not end
   *  with one, and so it is suitable to be printed out with a
   *  {@code println()}. The String is empty when {@link #TIME} is false.
   *
   *  @return Human readable information on time spent in processing.
   */
  public String timingInformation() {
    StringBuilder sb = new StringBuilder();
    if (TIME) {
      sb.append("Annotation pipeline timing information:");
      sb.append(IOUtils.eolChar);
      Iterator<MutableLong> it = accumulatedTime.iterator();
      long total = 0;
      for (Annotator annotator : annotators) {
        MutableLong m = it.next();
        sb.append(StringUtils.getShortClassName(annotator)).append(": ");
        sb.append(Timing.toSecondsString(m.longValue())).append(" sec.");
        sb.append(IOUtils.eolChar);
        total += m.longValue();
      }
      sb.append("TOTAL: ").append(Timing.toSecondsString(total)).append(" sec.");
    }
    return sb.toString();
  }

  /**
   * Return the union of the requirements satisfied by every annotator in
   * this pipeline.
   *
   * @return A freshly created set of all annotation classes the annotators report as satisfied
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    Set<Class<? extends CoreAnnotation>> satisfied = Generics.newHashSet();
    for (Annotator annotator : annotators) {
      satisfied.addAll(annotator.requirementsSatisfied());
    }
    return satisfied;
  }

  /**
   * Return the requirements of the first annotator in the pipeline, or an
   * empty set if the pipeline has no annotators. Requirements of later
   * annotators are not consulted here.
   *
   * @return The requirements of the first annotator, or the empty set
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    if (annotators.isEmpty()) {
      return Collections.emptySet();
    }
    return annotators.get(0).requires();
  }

  /**
   * Small demo: builds a tokenizer / sentence splitter / parser pipeline,
   * annotates one hard-coded sentence, and prints the tokens, the parse
   * tree of each sentence and the timing information.
   *
   * @param args Ignored
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    Timing tim = new Timing();
    AnnotationPipeline ap = new AnnotationPipeline();
    final boolean verbose = false;
    ap.addAnnotator(new TokenizerAnnotator(verbose, "en"));
    ap.addAnnotator(new WordsToSentencesAnnotator(verbose));
    // ap.addAnnotator(new NERCombinerAnnotator(verbose));
    // ap.addAnnotator(new OldNERAnnotator(verbose));
    // ap.addAnnotator(new NERMergingAnnotator(verbose));
    ap.addAnnotator(new ParserAnnotator(verbose, -1));
    /*
    ap.addAnnotator(new UpdateSentenceFromParseAnnotator(verbose));
    ap.addAnnotator(new NumberAnnotator(verbose));
    ap.addAnnotator(new QuantifiableEntityNormalizingAnnotator(verbose));
    ap.addAnnotator(new StemmerAnnotator(verbose));
    ap.addAnnotator(new MorphaAnnotator(verbose));
    **/
    // ap.addAnnotator(new SRLAnnotator());

    String text = ("USAir said in the filings that Mr. Icahn first contacted Mr. Colodny last September to discuss the benefits of combining TWA and USAir -- either by TWA's acquisition of USAir, or USAir's acquisition of TWA.");
    Annotation a = new Annotation(text);
    ap.annotate(a);
    System.out.println(a.get(CoreAnnotations.TokensAnnotation.class));
    for (CoreMap sentence : a.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    if (TIME) {
      System.out.println(ap.timingInformation());
      log.info("Total time for AnnotationPipeline: " +
          tim.toSecondsString() + " sec.");
    }
  }

}