AnnotationUtils.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.ie.machinereading.structure; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Set;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;

/**
 * Utilities to manipulate Annotations storing datasets or sentences with Machine Reading info
 * @author Mihai
 *
 */
public class AnnotationUtils  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(AnnotationUtils.class);
  private AnnotationUtils() {} // only static methods

  /**
   * Given a list of sentences (as CoreMaps), wrap it in a new Annotation.
   */
  public static Annotation createDataset(List<CoreMap> sentences) {
    Annotation dataset = new Annotation("");
    addSentences(dataset,sentences);
    return dataset;
  }

  /**
   * Randomized shuffle of all sentences int this dataset
   * @param dataset
   */
  public static void shuffleSentences(CoreMap dataset) {
    List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

    // we use a constant seed for replicability of experiments
    Collections.shuffle(sentences, new Random(0));

    dataset.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }

  /**
   * Converts the labels of all entity mentions in this dataset to sequences of CoreLabels
   * @param dataset
   * @param annotationsToSkip
   * @param useSubTypes
   */
  public static List<List<CoreLabel>> entityMentionsToCoreLabels(CoreMap dataset, Set<String> annotationsToSkip, boolean useSubTypes, boolean useBIO) {
    List<List<CoreLabel>> retVal = new ArrayList<>();
    List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
      List<CoreLabel> labeledSentence = sentenceEntityMentionsToCoreLabels(sentence, true, annotationsToSkip, null, useSubTypes, useBIO);
      assert(labeledSentence != null);
      retVal.add(labeledSentence);
    }

    return retVal;
  }

  /**
   * Converts the labels of all entity mentions in this sentence to sequences of CoreLabels
   * @param sentence
   * @param addAnswerAnnotation
   * @param annotationsToSkip
   * @param useSubTypes
   */
  public static List<CoreLabel> sentenceEntityMentionsToCoreLabels(
      CoreMap sentence,
      boolean addAnswerAnnotation,
      Set<String> annotationsToSkip,
      Set<String> mentionTypesToUse,
      boolean useSubTypes,
      boolean useBIO) {
    /*
    Tree completeTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
    if(completeTree == null){
      throw new RuntimeException("ERROR: TreeAnnotation MUST be set before calling this method!");
    }
    */

    //
    // Set TextAnnotation and PartOfSpeechAnnotation (using the parser data)
    //
    /*
    List<CoreLabel> labels = new ArrayList<CoreLabel>();
    List<Tree> tokenList = completeTree.getLeaves();
    for (Tree tree : tokenList) {
      Word word = new Word(tree.label());
      CoreLabel label = new CoreLabel();
      label.set(CoreAnnotations.TextAnnotation.class, word.value());
      if (addAnswerAnnotation) {
        label.set(CoreAnnotations.AnswerAnnotation.class,
            SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
      }
      label.set(CoreAnnotations.PartOfSpeechAnnotation.class, tree.parent(completeTree).label().value());
      labels.add(label);
    }
    */
    // use the token CoreLabels not the parser data => more robust
    List<CoreLabel> labels = new ArrayList<>();
    for(CoreLabel l: sentence.get(CoreAnnotations.TokensAnnotation.class)){
      CoreLabel nl = new CoreLabel(l);
      if (addAnswerAnnotation) {
        nl.set(CoreAnnotations.AnswerAnnotation.class, SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
      }
      labels.add(nl);
    }

    // Add AnswerAnnotation from the types of the entity mentions
    if (addAnswerAnnotation) {
      List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
      if(entities != null){
        for (EntityMention entity : entities) {
          // is this a type that we should skip?
          if(annotationsToSkip != null && annotationsToSkip.contains(entity.getType())) continue;
          // is this a valid mention type?
          if(mentionTypesToUse != null && ! mentionTypesToUse.contains(entity.getMentionType())) continue;

          // ignore entities without head span
          if(entity.getHead() != null){
            for(int i = entity.getHeadTokenStart(); i < entity.getHeadTokenEnd(); i ++){
              String tag = entity.getType();
              if(useSubTypes && entity.getSubType() != null) tag += "-" + entity.getSubType();
              if(useBIO){
                if(i == entity.getHeadTokenStart()) tag = "B-" + tag;
                else tag = "I-" + tag;
              }
              labels.get(i).set(CoreAnnotations.AnswerAnnotation.class, tag);
            }
          }
        }
      }
    }

    /*
    // Displaying the CoreLabels generated for this sentence
    log.info("sentence to core labels:");
    for(CoreLabel l: labels){
      log.info(" " + l.word() + "/" + l.getString(CoreAnnotations.PartOfSpeechAnnotation.class));
      String tag = l.getString(CoreAnnotations.AnswerAnnotation.class);
      if(tag != null && ! tag.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)){
        log.info("/" + tag);
      }
    }
    log.info();
    */

    return labels;
  }

  public static CoreMap getSentence(CoreMap dataset, int i) {
    return dataset.get(CoreAnnotations.SentencesAnnotation.class).get(i);
  }

  public static int sentenceCount(CoreMap dataset) {
    List<CoreMap> sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    if(sents != null) return sents.size();
    return 0;
  }

  public static void addSentence(CoreMap dataset, CoreMap sentence) {
    List<CoreMap> sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    if(sents == null){
      sents = new ArrayList<>();
      dataset.set(CoreAnnotations.SentencesAnnotation.class, sents);
    }
    sents.add(sentence);
  }

  public static void addSentences(CoreMap dataset, List<CoreMap> sentences) {
    List<CoreMap> sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    if(sents == null){
      sents = new ArrayList<>();
      dataset.set(CoreAnnotations.SentencesAnnotation.class, sents);
    }
    for(CoreMap sentence: sentences){
      sents.add(sentence);
    }
  }

  /**
   * Creates a deep copy of the given dataset with new lists for all mentions (entity, relation, event)
   * @param dataset
   */
  public static Annotation deepMentionCopy(CoreMap dataset) {
    Annotation newDataset = new Annotation("");

    List<CoreMap> sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    List<CoreMap> newSents = new ArrayList<>();
    if(sents != null){
      for(CoreMap sent: sents){
        if(! (sent instanceof Annotation)){
          throw new RuntimeException("ERROR: Sentences must instantiate Annotation!");
        }
        CoreMap newSent = sentenceDeepMentionCopy((Annotation) sent);
        newSents.add(newSent);
      }
    }

    addSentences(newDataset, newSents);
    return newDataset;
  }

  /**
   * Deep copy of the sentence: we create new entity/relation/event lists here.
   * However,  we do not deep copy the ExtractionObjects themselves!
   * @param sentence
   */
  public static Annotation sentenceDeepMentionCopy(Annotation sentence) {
    Annotation newSent = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));

    newSent.set(CoreAnnotations.TokensAnnotation.class, sentence.get(CoreAnnotations.TokensAnnotation.class));
    newSent.set(TreeCoreAnnotations.TreeAnnotation.class, sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    newSent.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class));
    newSent.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class));
    newSent.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class));
    newSent.set(CoreAnnotations.DocIDAnnotation.class, sentence.get(CoreAnnotations.DocIDAnnotation.class));

    // deep copy of all mentions lists
    List<EntityMention> ents = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(ents != null) newSent.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>(ents));
    List<RelationMention> rels = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(rels != null) newSent.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, new ArrayList<>(rels));
    List<EventMention> evs = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
    if(evs != null) newSent.set(MachineReadingAnnotations.EventMentionsAnnotation.class, new ArrayList<>(evs));

    return newSent;
  }

  /**
   * Return the relation that holds between the given entities.
   * Return a relation of type UNRELATED if this sentence contains no relation between the entities.
   */
  public static RelationMention getRelation(RelationMentionFactory factory, CoreMap sentence, ExtractionObject ... args) {
    return getRelations(factory, sentence, args).get(0);
  }

  /**
   * Return all the relations that holds between the given entities.
   * Returns a list containing a relation of type UNRELATED if this sentence contains no relation between the entities.
   */
  public static List<RelationMention> getRelations(RelationMentionFactory factory, CoreMap sentence, ExtractionObject... args) {
    List<RelationMention> relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    List<RelationMention> matchingRelationMentions = new ArrayList<>();
    if (relationMentions != null) {
      for (RelationMention rel : relationMentions) {
        if (rel.argsMatch(args)) {
          matchingRelationMentions.add(rel);
        }
      }
    }
    if (matchingRelationMentions.size() == 0) {
      matchingRelationMentions.add(RelationMention.createUnrelatedRelation(factory, args));
    }
    return matchingRelationMentions;
  }

  /**
   * Get list of all relations and non-relations between EntityMentions in this sentence
   * Use with care. This is an expensive call due to getAllUnrelatedRelations, which creates all non-existing relations between all entity mentions
   */
  public static List<RelationMention> getAllRelations(RelationMentionFactory factory, CoreMap sentence, boolean createUnrelatedRelations) {
    List<RelationMention> relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    List<RelationMention> allRelations = new ArrayList<>();
    if(relationMentions != null) allRelations.addAll(relationMentions);
    if(createUnrelatedRelations){
      allRelations.addAll(getAllUnrelatedRelations(factory, sentence, true));
    }
    return allRelations;
  }

  public static List<RelationMention> getAllUnrelatedRelations(RelationMentionFactory factory, CoreMap sentence, boolean checkExisting) {

    List<RelationMention> relationMentions = (checkExisting ? sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class) : null);
    List<EntityMention> entityMentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    List<RelationMention> nonRelations = new ArrayList<>();

    //
    // scan all possible arguments
    //
    if(entityMentions != null){
      for(int i = 0; i < entityMentions.size(); i ++){
        for(int j = 0; j < entityMentions.size(); j ++){
          if(i == j) continue;
          EntityMention arg1 = entityMentions.get(i);
          EntityMention arg2 = entityMentions.get(j);
          boolean match = false;
          if(relationMentions != null){
            for (RelationMention rel : relationMentions) {
              if (rel.argsMatch(arg1, arg2)) {
                match = true;
                break;
              }
            }
          }
          if (match == false) {
          	nonRelations.add(RelationMention.createUnrelatedRelation(factory, arg1,arg2));
          }
        }
      }
    }

    return nonRelations;
  }

  public static void addEntityMention(CoreMap sentence, EntityMention arg) {
    List<EntityMention> l = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, l);
    }
    l.add(arg);
  }

  public static void addEntityMentions(CoreMap sentence, Collection<EntityMention> args) {
    List<EntityMention> l = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, l);
    }
    l.addAll(args);
  }

  public List<EntityMention> getEntityMentions(CoreMap sent) {
    return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class));
  }

  public static void addRelationMention(CoreMap sentence, RelationMention arg) {
    List<RelationMention> l = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, l);
    }
    l.add(arg);
  }

  public static void addRelationMentions(CoreMap sentence, Collection<RelationMention> args) {
    List<RelationMention> l = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, l);
    }
    l.addAll(args);
  }

  public List<RelationMention> getRelationMentions(CoreMap sent) {
    return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class));
  }

  public static void addEventMention(CoreMap sentence, EventMention arg) {
    List<EventMention> l = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, l);
    }
    l.add(arg);
  }

  public static void addEventMentions(CoreMap sentence, Collection<EventMention> args) {
    List<EventMention> l = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, l);
    }
    l.addAll(args);
  }

  public List<EventMention> getEventMentions(CoreMap sent) {
    return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.EventMentionsAnnotation.class));
  }

	/**
   * Prepare a string for printing in a spreadsheet for Mechanical Turk input.
   * @param s String to be formatted
   * @return String string enclosed in quotes with other quotes escaped, and with better formatting for readability by Turkers.
   */
  public static String prettify(String s) {
    if (s==null) return "";
    return s.replace(
        " ,",",").replace(
            " .",".").replace(
                " :",":").replace(
                    "( ","(").replace(
                        "[ ","[").replace(
                            " )",")").replace(
                                " ]","]").replace(
                                    " - ","-").replace(
                                        " '","'").replace(
                                            "-LRB- ","(").replace(
                                                " -RRB-",")").replace(
                                                    "` ` ","\"").replace(
                                                        " ' '","\"").replace(
                                                            " COMMA",",");
  }

  /**
   * Fetches the sentence text in a given token span
   * @param span
   */
  public static String getTextContent(CoreMap sent, Span span) {
    List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
    StringBuffer buf = new StringBuffer();
    assert(span != null);
    for(int i = span.start(); i < span.end(); i ++){
      if(i > span.start()) buf.append(" ");
      buf.append(tokens.get(i).word());
    }
    return buf.toString();
  }

  public static String sentenceToString(CoreMap sent) {
    StringBuilder sb = new StringBuilder(512);
    List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
    sb.append("\"" + StringUtils.join(tokens, " ") + "\"");
    sb.append("\n");

    List<RelationMention> relationMentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(relationMentions != null){
      for (RelationMention rel : relationMentions) {
        sb.append("\n");
        sb.append(rel);
      }
    }

    // TODO: add entity and event mentions

    return sb.toString();
  }

  public static String tokensAndNELabelsToString(CoreMap sentence) {
    StringBuffer os = new StringBuffer();
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if(tokens != null){
      boolean first = true;
      for(CoreLabel token: tokens) {
        if(! first) os.append(" ");
        os.append(token.word());
        if(token.ner() != null && ! token.ner().equals("O")){
          os.append("/" + token.ner());
        }
        first = false;
      }
    }
    return os.toString();
  }

  public static String datasetToString(CoreMap dataset){
    List<CoreMap> sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    StringBuffer b = new StringBuffer();
    if(sents != null){
      for(CoreMap sent: sents){
        b.append(sentenceToString(sent));
      }
    }
    return b.toString();
  }

  /*
  public static List<CoreLabel> wordsToCoreLabels(List<Word> words) {
    List<CoreLabel> labels = new ArrayList<CoreLabel>();
    for(Word word: words){
      CoreLabel l = new CoreLabel();
      l.setWord(word.word());
      l.set(CoreAnnotations.TextAnnotation.class, word.word());
      l.setBeginPosition(word.beginPosition());
      l.setEndPosition(word.endPosition());
      labels.add(l);
    }
    return labels;
  }
  */

  public static String tokensToString(List<CoreLabel> tokens) {
    StringBuffer os = new StringBuffer();
    boolean first = true;
    for(CoreLabel t: tokens){
      if(! first) os.append(" ");
      os.append(t.word() + "{" + t.beginPosition() + ", " + t.endPosition() + "}");
      first = false;
    }
    return os.toString();
  }

  /*
  public static boolean sentenceContainsSpan(CoreMap sentence, Span span) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    int sentenceStart = tokens.get(0).beginPosition();
    int sentenceEnd = tokens.get(tokens.size() - 1).endPosition();
    return sentenceStart <= span.start() && sentenceEnd >= span.end();
  }
  */

  /*
   * Shift the character offsets of all tokens by offset.
   */
  public static void updateOffsets(List<Word> tokens, int offset) {
    for(Word l: tokens) {
      l.setBeginPosition(l.beginPosition() + offset);
      l.setEndPosition(l.endPosition() + offset);
    }
  }

  /*
   * Shift the character offsets of all tokens by offset.
   */
  public static void updateOffsetsInCoreLabels(List<CoreLabel> tokens, int offset) {
    for(CoreLabel l: tokens) {
      l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, l.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + offset);
      l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, l.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) + offset);
    }
  }

  /**
   * Process string to be a cell in Excel file.
   * Escape any quotes in the string and enclose the whole string with quotes.
   */
  public static String excelify(String s) {
    return '"'+s.replace("\"","\"\"")+'"';
  }

  public static List<CoreMap> readSentencesFromFile(String path) throws IOException, ClassNotFoundException {
    Annotation doc = (Annotation) IOUtils.readObjectFromFile(path);
    return doc.get(CoreAnnotations.SentencesAnnotation.class);

  }

}