// GenericDataSetReader.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.ie.machinereading; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.stanford.nlp.ie.machinereading.common.NoPunctuationHeadFinder;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;

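/**
 * Reader for machine-reading datasets. It loads a dataset into an
 * {@link Annotation} through {@link #read(String)}, optionally runs an NL
 * processor over the sentences, and assigns a syntactic head token to every
 * entity mention. {@link #read(String)} returns {@code null} in this class,
 * so concrete readers are presumably expected to override it.
 *
 * @author Andrey Gusev
 * @author Mihai
 */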
public class GenericDataSetReader  {

  /**
   * Redwood logging channel for this class. Within this file it is only
   * referenced from commented-out debug statements.
   */
  private static Redwood.RedwoodChannels log = Redwood.channels(GenericDataSetReader.class);

  /**
   * java.util.logging logger used for every message emitted by this class.
   * It is created in the constructor, which sets its level to SEVERE.
   */
  protected Logger logger;

  /** Finds the syntactic head of a syntactic constituent */
  protected final HeadFinder headFinder = new NoPunctuationHeadFinder();

  /** NL processor to use for sentence pre-processing; may be null, in which case no annotation is run */
  protected StanfordCoreNLP processor;

  /**
   * Additional NL processor that implements only syntactic parsing (needed for head detection)
   * We need this processor to detect heads of predicted entities that cannot be matched to an existing constituent.
   * This is created on demand, only when necessary
   */
  protected Annotator parserProcessor;

  /** If true, we perform syntactic analysis of the dataset sentences and annotations */
  protected final boolean preProcessSentences;

  /**
   * If true, sets the head span to match the syntactic head of the extent.
   * Otherwise, the head span is not modified.
   * This is enabled for the NFL domain, where head spans are not given.
   */
  protected final boolean calculateHeadSpan;

  /** If true, it regenerates the index spans for all tree nodes (useful for KBP) */
  protected final boolean forceGenerationOfIndexSpans;

  /**
   * Selects the head-finding strategy: when true (the default) findSyntacticHead
   * uses its own constrained local parse; when false it delegates to
   * originalFindSyntacticHead. Only around for legacy results.
   */
  protected boolean useNewHeadFinder = true;

  /**
   * Creates a reader without an NL processor and with sentence pre-processing,
   * head-span calculation and forced index-span generation all turned off.
   */
  public GenericDataSetReader() {
    this(null, false, false, false);
  }

  /**
   * Creates a reader with the given configuration.
   *
   * @param processor NL processor used for sentence pre-processing; may be null
   * @param preProcessSentences whether {@link #parse(String)} should pre-process the sentences it reads
   * @param calculateHeadSpan whether the head span of each mention is set to its syntactic head
   * @param forceGenerationOfIndexSpans whether tree index spans are always regenerated
   */
  public GenericDataSetReader(StanfordCoreNLP processor, boolean preProcessSentences, boolean calculateHeadSpan, boolean forceGenerationOfIndexSpans) {
    // only SEVERE messages are emitted until setLoggerLevel is called
    this.logger = Logger.getLogger(GenericDataSetReader.class.getName());
    this.logger.setLevel(Level.SEVERE);

    if (processor != null) {
      setProcessor(processor);
    }

    // the parser-only annotator is fetched lazily by getParser()
    this.parserProcessor = null;

    this.preProcessSentences = preProcessSentences;
    this.calculateHeadSpan = calculateHeadSpan;
    this.forceGenerationOfIndexSpans = forceGenerationOfIndexSpans;
  }

  /**
   * Sets the NL processor used by preProcessSentences.
   *
   * @param p the processor to store; it is assigned as-is, without a null check
   */
  public void setProcessor(StanfordCoreNLP p) {
    this.processor = p;
  }

  /**
   * Chooses the head-finding strategy used by findSyntacticHead.
   *
   * @param useNewHeadFinder if false, findSyntacticHead delegates to originalFindSyntacticHead
   */
  public void setUseNewHeadFinder(boolean useNewHeadFinder) {
    this.useNewHeadFinder = useNewHeadFinder;
  }

  /**
   * Returns the parser annotator, looking it up on first use.
   * The annotator is obtained from the existing "parse" annotator registered
   * with StanfordCoreNLP and cached in {@code parserProcessor}.
   *
   * @return the cached parser annotator; never null
   * @throws IllegalStateException if no "parse" annotator is available
   */
  public Annotator getParser() {
    if (parserProcessor == null) {
      Annotator existing = StanfordCoreNLP.getExistingAnnotator("parse");
      // An assert is a no-op unless the JVM runs with -ea, which would let a
      // null escape and fail later with an anonymous NullPointerException.
      if (existing == null) {
        throw new IllegalStateException(
            "No existing \"parse\" annotator is available; a parser is required for syntactic head detection");
      }
      parserProcessor = existing;
    }
    return parserProcessor;
  }

  /**
   * Changes the level of this reader's logger.
   *
   * @param level the new logging level, passed straight to Logger.setLevel
   */
  public void setLoggerLevel(Level level) {
    logger.setLevel(level);
  }
  /**
   * @return the level currently set on this reader's logger, as reported by Logger.getLevel
   */
  public Level getLoggerLevel() {
    return logger.getLevel();
  }

  /**
   * Parses one file or directory with data from one domain.
   * The dataset is loaded with {@link #read(String)}; when sentence
   * pre-processing is enabled it is then run through preProcessSentences and,
   * if MachineReadingProperties.trainUsePipelineNER is set, the entity types
   * are replaced using the CoreNLP NER pipeline.
   *
   * @param path the file or directory to read
   * @return the dataset Annotation produced by {@link #read(String)}
   * @throws IOException if {@link #read(String)} throws any exception; the
   *     original exception is preserved as the cause
   */
  public final Annotation parse(String path) throws IOException {
    Annotation retVal; // set below or exceptions

    try {
      //
      // this must return a dataset Annotation. each sentence in this dataset must contain:
      // - TokensAnnotation
      // - EntityMentionAnnotation
      // - RelationMentionAnnotation
      // - EventMentionAnnotation
      // the other annotations (parse, NER) are generated in preProcessSentences
      //
      retVal = this.read(path);
    } catch (Exception ex) {
      // carry both a readable message and the underlying cause
      throw new IOException("Failed to read dataset from " + path, ex);
    }

    if (preProcessSentences) {
      preProcessSentences(retVal);
      if (MachineReadingProperties.trainUsePipelineNER) {
        logger.severe("Changing NER tags using the CoreNLP pipeline.");
        modifyUsingCoreNLPNER(retVal);
      }
    }
    return retVal;
  }
  
  /**
   * Re-labels every entity mention in the document with an NER tag produced
   * by a freshly built "pos, lemma, ner" CoreNLP pipeline. For each mention,
   * the NER tags of the tokens inside its extent are counted and the tag
   * selected by Counters.argmax becomes the mention's new type.
   *
   * @param doc the dataset to annotate and modify in place
   */
  private void modifyUsingCoreNLPNER(Annotation doc) {
    Properties nerProps = new Properties();
    nerProps.setProperty("annotators", "pos, lemma, ner");
    StanfordCoreNLP nerPipeline = new StanfordCoreNLP(nerProps, false);
    nerPipeline.annotate(doc);

    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<EntityMention> mentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
      if (mentions == null) {
        // nothing to re-label in this sentence
        continue;
      }

      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (EntityMention mention : mentions) {
        Span extent = mention.getExtent();
        Counter<String> tagVotes = new ClassicCounter<>();
        for (int pos = extent.start(); pos < extent.end(); pos++) {
          CoreLabel token = tokens.get(pos);
          tagVotes.incrementCount(token.ner());
        }
        mention.setType(Counters.argmax(tagVotes));
      }
    }
  }

  /**
   * Reads the dataset found at the given path. This implementation reads
   * nothing and returns null; parse(String) calls it and, per the comment
   * there, expects a dataset Annotation back.
   * NOTE(review): presumably subclasses override this - confirm.
   *
   * @param path the file or directory to read
   * @return always null in this class
   * @throws Exception declared so that implementations may report any failure
   */
  public Annotation read(String path) throws Exception {
    return null;
  }

  /**
   * Joins the words of the given tokens with single spaces.
   *
   * @param tokens the tokens to print; may be null
   * @return the space-separated words, or the empty string if tokens is null or empty
   */
  private static String sentenceToString(List<CoreLabel> tokens) {
    if (tokens == null) {
      return "";
    }

    StringBuilder text = new StringBuilder();
    String separator = "";
    for (CoreLabel token : tokens) {
      text.append(separator).append(token.word());
      // every word after the first is preceded by one space
      separator = " ";
    }
    return text.toString();
  }


  /**
   * Find the index of the head of an entity.
   * If the mention already has a syntactic head position (anything other than
   * -1), that position is returned unchanged. Otherwise the head is looked up
   * with {@link #findSyntacticHead}; if that fails or finds nothing, the last
   * token of the mention extent is used instead. The chosen position is stored
   * on the mention.
   *
   * @param ent The entity mention
   * @param tree The Tree for the entire sentence in which it occurs.
   * @param tokens The Sentence in which it occurs
   * @param setHeadSpan Whether to set the head span in the entity mention.
   * @return The index of the entity head
   */
  public int assignSyntacticHead(EntityMention ent, Tree tree, List<CoreLabel> tokens, boolean setHeadSpan) {
    int knownHead = ent.getSyntacticHeadTokenPosition();
    if (knownHead != -1) {
      return knownHead;
    }

    // Building these messages stringifies the whole tree; skip that work
    // unless the messages will actually be published.
    if (logger.isLoggable(Level.FINEST)) {
      logger.finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.toString());
      logger.finest("Flat sentence is: " + tokens);
    }

    Tree sh = null;
    try {
      sh = findSyntacticHead(ent, tree, tokens);
    } catch (Exception | AssertionError e) {
      // Best effort: report the failure through the logger (with its stack
      // trace) and fall through to the right-most head heuristic below.
      logger.log(Level.SEVERE,
          "WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + sentenceToString(tokens),
          e);
    }

    // default: the last token of the mention extent
    int headPos = ent.getExtentTokenEnd() - 1;
    if (sh != null) {
      CoreLabel label = (CoreLabel) sh.label();
      headPos = label.get(CoreAnnotations.BeginIndexAnnotation.class);
    } else if (logger.isLoggable(Level.FINE)) {
      logger.fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
      logger.fine("Fallback strategy: will set head to last token in mention: " + tokens.get(headPos));
    }
    ent.setHeadTokenPosition(headPos);

    if (setHeadSpan) {
      // set the head span to match exactly the syntactic head
      // this is needed for some corpora where the head span is not given
      ent.setHeadTokenSpan(new Span(headPos, headPos + 1));
    }

    return headPos;
  }

  /**
   * Take a dataset Annotation, generate its parse trees and identify syntactic heads (and head spans, if necessary).
   * If a processor is set and the first sentence has no parse tree yet, the
   * processor is run over the whole dataset. Every sentence must then carry a
   * parse tree; its labels are converted to CoreLabels, index spans are
   * generated when needed, and each entity mention gets a syntactic head.
   *
   * @param dataset the dataset to process in place
   * @throws RuntimeException if a sentence has no parse tree
   */
  public void preProcessSentences(Annotation dataset) {
    logger.severe("GenericDataSetReader: Started pre-processing the corpus...");
    // run the processor, i.e., NER, parse etc.
    if (processor != null) {
      // we might already have syntactic annotation from offline files
      List<CoreMap> existing = dataset.get(CoreAnnotations.SentencesAnnotation.class);
      if (existing.size() > 0 && !existing.get(0).containsKey(TreeCoreAnnotations.TreeAnnotation.class)) {
        logger.info("Annotating dataset with " + processor);
        processor.annotate(dataset);
      } else {
        logger.info("Found existing syntactic annotations. Will not use the NLP processor.");
      }
    }

    // Debug messages below stringify tokens and whole parse trees, so they are
    // only built when FINE logging is enabled.
    final boolean fineEnabled = logger.isLoggable(Level.FINE);

    // fetched again after the annotation step, in case the processor replaced the sentence list
    List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    logger.fine("Extracted " + sentences.size() + " sentences.");
    for (CoreMap sentence : sentences) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      if (fineEnabled) {
        logger.fine("Processing sentence " + tokens);
      }
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      if (tree == null) {
        throw new RuntimeException("ERROR: MR requires full syntactic analysis!");
      }

      // convert tree labels to CoreLabel if necessary
      // we need this because we store additional info in the CoreLabel, such as the spans of each tree
      convertToCoreLabels(tree);

      // store the tree spans, if not present already
      // NOTE(review): spans are generated only when BOTH index keys are missing;
      // a root label carrying just one of them is left untouched - confirm this is intended.
      CoreLabel rootLabel = (CoreLabel) tree.label();
      boolean spansMissing = !rootLabel.containsKey(CoreAnnotations.BeginIndexAnnotation.class)
          && !rootLabel.containsKey(CoreAnnotations.EndIndexAnnotation.class);
      if (forceGenerationOfIndexSpans || spansMissing) {
        tree.indexSpans(0);
        logger.fine("Index spans were generated.");
      } else {
        logger.fine("Index spans were NOT generated.");
      }
      if (fineEnabled) {
        logger.fine("Parse tree using CoreLabel:\n" + tree.pennString());
      }

      //
      // now match all entity mentions against the syntactic tree
      //
      List<EntityMention> mentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
      if (mentions != null) {
        for (EntityMention ent : mentions) {
          if (fineEnabled) {
            logger.fine("Finding head for entity: " + ent);
          }
          int headPos = assignSyntacticHead(ent, tree, tokens, calculateHeadSpan);
          if (fineEnabled) {
            logger.fine("Syntactic head of mention \"" + ent + "\" is: " + tokens.get(headPos).word());
          }

          assert(ent.getExtent() != null);
          assert(ent.getHead() != null);
          assert(ent.getSyntacticHeadTokenPosition() >= 0);
        }
      }
    }
    logger.severe("GenericDataSetReader: Pre-processing complete.");
  }

  /**
   * Replaces every non-CoreLabel label in the tree with a new CoreLabel that
   * carries the same value. CoreLabels are required because extra
   * information, such as token spans, is stored on them.
   *
   * @param tree the root of the tree to convert in place
   */
  public static void convertToCoreLabels(Tree tree) {
    // explicit worklist instead of recursion; every node is visited exactly once
    List<Tree> pending = new ArrayList<>();
    pending.add(tree);

    while (!pending.isEmpty()) {
      Tree node = pending.remove(pending.size() - 1);

      Label current = node.label();
      if (!(current instanceof CoreLabel)) {
        CoreLabel replacement = new CoreLabel();
        replacement.setValue(current.value());
        node.setLabel(replacement);
      }

      for (Tree child : node.children()) {
        pending.add(child);
      }
    }
  }

  /**
   * Renders the tree through Tree.toStringBuilder into a fresh builder,
   * passing true as its second argument.
   *
   * @param tree the tree to render
   * @return the rendered string
   */
  private static String printTree(Tree tree) {
    return tree.toStringBuilder(new StringBuilder(), true).toString();
  }

  /**
   * Returns the head terminal of the given tree, with fallbacks so that the
   * result is never null for a non-null input: first the head terminal found
   * by the head finder, then the right-most leaf, then the tree itself.
   *
   * @param top the constituent whose head is wanted
   * @return the head terminal, the right-most leaf, or top
   */
  private Tree safeHead(Tree top) {
    Tree head = top.headTerminal(headFinder);
    if (head == null) {
      List<Tree> leaves = top.getLeaves();
      // no head found: use the right-most leaf, or top itself when there are no leaves
      head = leaves.isEmpty() ? top : leaves.get(leaves.size() - 1);
    }
    return head;
  }

  /**
   * Finds the syntactic head of the given entity mention.
   * When a constituent of the sentence tree spans exactly the mention extent,
   * its head is returned. Otherwise the extent is parsed on its own, embedded
   * in the carrier sentence "It was ... .", and the head found there is mapped
   * back to a leaf of the sentence tree.
   *
   * @param ent The entity mention
   * @param root The Tree for the entire sentence in which it occurs.
   * @param tokens The Sentence in which it occurs
   * @return The leaf of root corresponding to the head, or null when the head
   *     of the local parse cannot be located among the leaves of root
   */
  public Tree findSyntacticHead(EntityMention ent, Tree root, List<CoreLabel> tokens) {
    if (!useNewHeadFinder) {
      return originalFindSyntacticHead(ent, root, tokens);
    }

    logger.fine("Searching for tree matching " + ent);
    Tree exactMatch = findTreeWithSpan(root, ent.getExtentTokenStart(), ent.getExtentTokenEnd());
    if (exactMatch != null) {
      // a constituent covers the extent exactly
      logger.fine("Mention \"" + ent + "\" mapped to tree: " + printTree(exactMatch));
      return safeHead(exactMatch);
    }

    // No exact match: parse the extent of the mention inside a small carrier
    // sentence, which helps the parser.
    final int prefixLength = 2;  // the two carrier words "It" and "was"
    List<CoreLabel> carrier = new ArrayList<>();
    carrier.add(initCoreLabel("It"));
    carrier.add(initCoreLabel("was"));

    // Separated dashes confuse the parser badly, so they are left out; the
    // count of omitted dashes bounds how far indices may have shifted.
    int skippedDashes = 0;
    for (int pos = ent.getExtentTokenStart(); pos < ent.getExtentTokenEnd(); pos++) {
      CoreLabel token = tokens.get(pos);
      if ("-".equals(token.word())) {
        skippedDashes++;
      } else {
        carrier.add(token);
      }
    }
    carrier.add(initCoreLabel("."));

    // Constrain the parse to the extent: it starts after the carrier prefix
    // and stops before the final period. Any constituent label is accepted.
    ParserConstraint extentConstraint = new ParserConstraint(prefixLength, carrier.size() - 1, ".*");
    Tree localParse = parse(carrier, Collections.singletonList(extentConstraint));
    logger.fine("No exact match found. Local parse:\n" + localParse.pennString());
    convertToCoreLabels(localParse);
    // shift the indices so that the first extent token keeps its sentence index
    localParse.indexSpans(ent.getExtentTokenStart() - prefixLength);

    Tree extentSubtree = findPartialSpan(localParse, ent.getExtentTokenStart());
    Tree extentHead = safeHead(extentSubtree);
    logger.fine("Head is: " + extentHead);
    assert(extentHead != null);

    // extentHead belongs to the local parse; locate the matching leaf in the
    // sentence tree. Because dashes were dropped, its index there is >= the
    // index in the local parse.
    CoreLabel headLabel = (CoreLabel) extentHead.label();
    Tree realHead = funkyFindLeafWithApproximateSpan(root, headLabel.value(), headLabel.get(CoreAnnotations.BeginIndexAnnotation.class), skippedDashes);
    if (realHead != null) {
      logger.fine("Chosen head: " + realHead);
    }
    return realHead;
  }

  /**
   * Walks down from the given node to the first node whose begin index
   * equals start. At each level the first child whose span contains start is
   * followed.
   *
   * @param current the node to start from; labels must be CoreLabels with index spans
   * @param start the begin index to look for
   * @return the first node on the path whose begin index is start
   * @throws RuntimeException if a node is reached that does not begin at start
   *     and has no child whose span contains start
   */
  private Tree findPartialSpan(Tree current, int start) {
    Tree node = current;
    while (true) {
      CoreLabel nodeLabel = (CoreLabel) node.label();
      int nodeStart = nodeLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
      if (nodeStart == start) {
        logger.fine("findPartialSpan: Returning " + node);
        return node;
      }

      // descend into the first child whose [begin, end) span covers start
      Tree next = null;
      for (Tree kid : node.children()) {
        CoreLabel kidLabel = (CoreLabel) kid.label();
        int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
        int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class);
        if (kidStart <= start && kidEnd > start) {
          next = kid;
          break;
        }
      }
      if (next == null) {
        throw new RuntimeException("Shouldn't happen: " + start + " " + node);
      }
      node = next;
    }
  }

  /**
   * Searches the leaves of root for the given token at an approximately
   * known position: the first leaf whose value equals token and whose begin
   * index lies in [index, index + approximateness] is returned.
   *
   * @param root the tree whose leaves are searched; leaf labels must be CoreLabels with a begin index
   * @param token the leaf value to look for
   * @param index the smallest acceptable begin index
   * @param approximateness how far past index the leaf may be
   * @return the matching leaf, or null (after logging a warning) if there is none
   */
  private Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) {
    logger.fine("Looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.pennString());

    final int lastAcceptable = index + approximateness;
    for (Tree leaf : root.getLeaves()) {
      CoreLabel leafLabel = (CoreLabel) leaf.label();
      int leafIndex = leafLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
      boolean sameToken = token.equals(leaf.value());
      boolean inWindow = leafIndex >= index && leafIndex <= lastAcceptable;
      if (sameToken && inWindow) {
        return leaf;
      }
    }

    // this shouldn't happen
    // but it does happen (VERY RARELY) on some weird web text that includes SGML tags with spaces
    // TODO: does this mean that somehow tokenization is different for the parser? check this by throwing an Exception in KBP
    logger.severe("GenericDataSetReader: WARNING: Failed to find head token");
    logger.severe("  when looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.pennString());
    return null;
  }

  /**
   * This is the original version of {@link #findSyntacticHead} before Chris's modifications.
   * There's no good reason to use it except for producing historical results.
   * It finds the syntactic head of the given entity mention: either the head
   * of the constituent that spans the extent exactly, or, failing that, the
   * head of a stand-alone parse of the extent tokens mapped back to the
   * sentence tree by its index span.
   *
   * @param ent The entity mention
   * @param root The Tree for the entire sentence in which it occurs.
   * @param tokens The Sentence in which it occurs
   * @return The tree object corresponding to the head, as found by span lookup in root
   */
  public Tree originalFindSyntacticHead(EntityMention ent, Tree root, List<CoreLabel> tokens) {
    logger.fine("Searching for tree matching " + ent);
    Tree exactMatch = findTreeWithSpan(root, ent.getExtentTokenStart(), ent.getExtentTokenEnd());
    if (exactMatch != null) {
      // a constituent covers the extent exactly
      logger.fine("Mention \"" + ent + "\" mapped to tree: " + printTree(exactMatch));
      return safeHead(exactMatch);
    }

    // no exact match: parse just the tokens of the mention extent
    List<CoreLabel> mentionTokens = new ArrayList<>();
    int pos = ent.getExtentTokenStart();
    while (pos < ent.getExtentTokenEnd()) {
      mentionTokens.add(tokens.get(pos));
      pos++;
    }

    Tree localParse = parse(mentionTokens);
    logger.fine("No exact match found. Local parse:\n" + localParse.pennString());
    convertToCoreLabels(localParse);
    // index the local parse with sentence-level positions
    localParse.indexSpans(ent.getExtentTokenStart());
    Tree extentHead = safeHead(localParse);
    assert (extentHead != null);

    // extentHead lives in the local parse; look up the node with the same
    // span in the sentence tree
    CoreLabel headLabel = (CoreLabel) extentHead.label();
    int headStart = headLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
    int headEnd = headLabel.get(CoreAnnotations.EndIndexAnnotation.class);
    Tree realHead = findTreeWithSpan(root, headStart, headEnd);
    assert (realHead != null);

    return realHead;
  }

  /**
   * Builds a CoreLabel for a bare token string, setting its word, its value,
   * and the text and value annotations to that string.
   *
   * @param token the token text
   * @return a new CoreLabel describing the token
   */
  private static CoreLabel initCoreLabel(String token) {
    final CoreLabel coreLabel = new CoreLabel();

    // convenience setters first, then the explicit annotations
    coreLabel.setWord(token);
    coreLabel.setValue(token);
    coreLabel.set(CoreAnnotations.TextAnnotation.class, token);
    coreLabel.set(CoreAnnotations.ValueAnnotation.class, token);

    return coreLabel;
  }

  /**
   * Parses a sentence given as plain strings: each string is wrapped in a
   * CoreLabel and the resulting list is handed to {@link #parse(List)}.
   *
   * @param tokens the words of the sentence, in order
   * @return the parse tree of the sentence
   */
  protected Tree parseStrings(List<String> tokens) {
    List<CoreLabel> wrapped = new ArrayList<>(tokens.size());
    for (String word : tokens) {
      wrapped.add(initCoreLabel(word));
    }
    return parse(wrapped);
  }

  /**
   * Parses the given tokens without any parser constraints.
   *
   * @param tokens the tokens of the sentence to parse
   * @return the tree returned by the two-argument parse with null constraints
   */
  protected Tree parse(List<CoreLabel> tokens) {
    return parse(tokens, null);
  }

  /**
   * Parses the given tokens with the parser annotator from getParser().
   * The tokens and constraints are wrapped in a one-sentence document, the
   * parser annotates that document, and the tree of its first sentence is
   * returned.
   *
   * @param tokens the tokens of the sentence to parse
   * @param constraints parser constraints to attach to the sentence; may be null
   * @return the tree annotation of the parsed sentence
   */
  protected Tree parse(List<CoreLabel> tokens,
                       List<ParserConstraint> constraints) {
    // the sentence carries the tokens plus the constraints for the parser
    CoreMap sentence = new Annotation("");
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(ParserAnnotations.ConstraintAnnotation.class, constraints);

    List<CoreMap> sentenceList = new ArrayList<>();
    sentenceList.add(sentence);
    Annotation document = new Annotation("");
    document.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);

    getParser().annotate(document);

    // read the sentence list back from the document rather than reusing ours
    CoreMap parsed = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    return parsed.get(TreeCoreAnnotations.TreeAnnotation.class);
  }

  /**
   * Finds the tree with the given token span.
   * The tree must have CoreLabel labels and Tree.indexSpans must be called before this method.
   * The search is depth-first and returns the first node whose begin and end
   * indices equal the requested ones; subtrees whose span cannot contain the
   * requested span are skipped.
   *
   * @param tree The tree to search in
   * @param start The beginning index
   * @param end The end index, compared against the node's end-index annotation
   * @return tree itself or one of its descendants if match; otherwise null
   */
  private static Tree findTreeWithSpan(Tree tree, int start, int end) {
    CoreLabel label = (CoreLabel) tree.label();
    boolean hasSpan = label != null
        && label.containsKey(CoreAnnotations.BeginIndexAnnotation.class)
        && label.containsKey(CoreAnnotations.EndIndexAnnotation.class);

    if (hasSpan) {
      int nodeStart = label.get(CoreAnnotations.BeginIndexAnnotation.class);
      int nodeEnd = label.get(CoreAnnotations.EndIndexAnnotation.class);
      if (nodeStart == start && nodeEnd == end) {
        // found perfect match
        return tree;
      }
      if (end < nodeStart || start >= nodeEnd) {
        // the requested span lies outside this node: prune the subtree
        return null;
      }
    }

    // otherwise, check inside children - a match is possible
    for (Tree kid : tree.children()) {
      if (kid != null) {
        Tree found = findTreeWithSpan(kid, start, end);
        if (found != null) {
          return found;
        }
      }
    }

    // no match
    return null;
  }

}