package edu.stanford.nlp.ie.machinereading.domains.roth;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.StringUtils;

/**
* A Reader designed for the relation extraction data studied in Dan Roth and Wen-tau Yih,
* A Linear Programming Formulation for Global Inference in Natural Language Tasks. CoNLL 2004.
 * The format is a somewhat ad hoc tab-separated-value file format.
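 * <p>
 * An illustrative sketch of the layout, as reconstructed from the parsing code
 * below: each sentence is a block of 9-column token lines such as
 * {@code 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O}, followed by a blank line,
 * then zero or more 3-column relation lines (the token indices of the two
 * argument entities and a relation type), then a second blank line. Literal
 * commas are escaped as the token {@code COMMA}.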
*
 * @author Mihai
 * @author David McClosky
 * @author agusev
 * @author Sonal Gupta (sonalg@stanford.edu)
*/
public class RothCONLL04Reader extends GenericDataSetReader {
public RothCONLL04Reader() {
super(null, true, true, true);
// change the logger to one from our namespace
logger = Logger.getLogger(RothCONLL04Reader.class.getName());
// run quietly by default
logger.setLevel(Level.SEVERE);
}
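
  /**
   * Reads every sentence (and any relations within it) from the given corpus
   * file into a single Annotation.
   */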
@Override
public Annotation read(String path) throws IOException {
Annotation doc = new Annotation("");
logger.info("Reading file: " + path);
// Each iteration through this loop processes a single sentence along with any relations in it
for (Iterator<String> lineIterator = IOUtils.readLines(path).iterator(); lineIterator.hasNext(); ) {
Annotation sentence = readSentence(path, lineIterator);
AnnotationUtils.addSentence(doc, sentence);
}
return doc;
}
private boolean warnedNER; // = false;
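
  /**
   * Maps the corpus's entity tags (Peop, Loc, Org, Other, O) to the canonical
   * NER labels used elsewhere in the machine reading code.
   */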
private String getNormalizedNERTag(String ner) {
if (ner.equalsIgnoreCase("O")) {
return "O";
} else if (ner.equalsIgnoreCase("Peop")) {
return "PERSON";
} else if (ner.equalsIgnoreCase("Loc")) {
return "LOCATION";
    } else if (ner.equalsIgnoreCase("Org")) {
      return "ORGANIZATION";
    } else if (ner.equalsIgnoreCase("Other")) {
      return "OTHER";
    } else {
      // warn once, then fail: these tags are outside the known inventory
      if ( ! warnedNER) {
        warnedNER = true;
        logger.warning("This file contains NER tags not in the original Roth/Yih dataset, e.g.: " + ner);
      }
      throw new RuntimeException("Cannot normalize NER tag " + ner);
    }
  }
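
  /**
   * Reads one sentence block (token lines, a blank line, then relation lines)
   * and converts it into an Annotation carrying the sentence's tokens, text,
   * entity mentions, and relation mentions.
   */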
private Annotation readSentence(String docId, Iterator<String> lineIterator) {
Annotation sentence = new Annotation("");
sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>());
// we'll need to set things like the tokens and textContent after we've
// fully read the sentence
// contains the full text that we've read so far
StringBuilder textContent = new StringBuilder();
int tokenCount = 0; // how many tokens we've seen so far
List<CoreLabel> tokens = new ArrayList<>();
    // when we've seen two blank lines in a row, this sentence is over (one
    // blank line separates the sentence from its relations)
int numBlankLinesSeen = 0;
String sentenceID = null;
// keeps tracks of entities we've seen so far for use by relations
Map<String, EntityMention> indexToEntityMention = new HashMap<>();
while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
String currentLine = lineIterator.next();
      // the corpus escapes literal commas as the token "COMMA"; restore them
      currentLine = currentLine.replace("COMMA", ",");
List<String> pieces = StringUtils.split(currentLine);
String identifier;
int size = pieces.size();
switch (size) {
case 1: // blank line between sentences or relations
numBlankLinesSeen++;
break;
case 3: // relation
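        /*
         * Relation lines have three columns: the token indices of the two
         * argument entities, then the relation type; an illustrative (not
         * verbatim) example: "2 5 Live_In".
         */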
String type = pieces.get(2);
List<ExtractionObject> args = new ArrayList<>();
EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
args.add(entity1);
args.add(entity2);
        Span span = new Span(entity1.getExtentTokenStart(), entity2.getExtentTokenEnd());
// identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
identifier = RelationMention.makeUniqueId();
        RelationMention relationMention =
            new RelationMention(identifier, sentence, span, type, null, args);
AnnotationUtils.addRelationMention(sentence, relationMention);
break;
case 9: // token
/*
* Roth token lines look like this:
*
* 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
*/
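        /*
         * Columns this reader relies on (reconstructed from the code below;
         * the remaining columns are ignored here): 0 = sentence number,
         * 1 = entity (NER) tag, 2 = token index within the sentence,
         * 4 = POS tag(s), 5 = word(s).
         */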
// Entities may be multiple words joined by '/'; we split these up
List<String> words = StringUtils.split(pieces.get(5), "/");
//List<String> postags = StringUtils.split(pieces.get(4),"/");
String text = StringUtils.join(words, " ");
identifier = "entity" + pieces.get(0) + '-' + pieces.get(2);
String nerTag = getNormalizedNERTag(pieces.get(1)); // entity type of the word/expression
        if (sentenceID == null) {
          sentenceID = pieces.get(0);
        }
if (!nerTag.equals("O")) {
Span extentSpan = new Span(tokenCount, tokenCount + words.size());
          // Temporarily sets the head span to equal the extent span.
          // This is so the entity has a head (in particular, getValue() works) even if preProcessSentences() isn't called.
          // The head span is later modified if preProcessSentences() is called.
EntityMention entity = new EntityMention(identifier, sentence,
extentSpan, extentSpan, nerTag, null, null);
AnnotationUtils.addEntityMention(sentence, entity);
// we can get by using these indices as strings since we only use them
// as a hash key
String index = pieces.get(2);
indexToEntityMention.put(index, entity);
}
// int i =0;
for (String word : words) {
CoreLabel label = new CoreLabel();
label.setWord(word);
//label.setTag(postags.get(i));
label.set(CoreAnnotations.TextAnnotation.class, word);
label.set(CoreAnnotations.ValueAnnotation.class, word);
          // we don't set TokenBeginAnnotation or TokenEndAnnotation (token
          // offsets), and we don't keep track of character offsets either
tokens.add(label);
// i++;
}
textContent.append(text);
textContent.append(' ');
tokenCount += words.size();
        break;
      default:
        // lines with any other number of columns are silently ignored
        break;
      }
}
sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);
return sentence;
}

  /**
   * Gets the index of an object in a list, testing with == (List.indexOf uses
   * equals(), which could be problematic here since two distinct tree nodes
   * can compare equal).
   */
private static <X> int getIndexByObjectEquality(List<X> list, X obj) {
for (int i = 0, sz = list.size(); i < sz; i++) {
if (list.get(i) == obj) {
return i;
}
}
return -1;
}

  /**
   * Sets the head word and the index for an entity, given the parse tree for
   * the sentence containing the entity.
   *
   * This code is no longer used, but I've kept it around (at least for now) as
   * a reference for when we modify preProcessSentences().
   */
@SuppressWarnings("unused")
private void setHeadWord(EntityMention entity, Tree tree) {
List<Tree> leaves = tree.getLeaves();
Tree argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()),
leaves.get(entity.getExtentTokenEnd()));
Tree headWordNode = argRoot.headTerminal(headFinder);
int headWordIndex = getIndexByObjectEquality(leaves, headWordNode);
if (StringUtils.isPunct(leaves.get(entity.getExtentTokenEnd()).label().value().trim())
&& (headWordIndex >= entity.getExtentTokenEnd()
|| headWordIndex < entity.getExtentTokenStart())) {
      argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()),
          leaves.get(entity.getExtentTokenEnd() - 1));
headWordNode = argRoot.headTerminal(headFinder);
headWordIndex = getIndexByObjectEquality(leaves, headWordNode);
if (headWordIndex >= entity.getExtentTokenStart()
&& headWordIndex <= entity.getExtentTokenEnd() - 1) {
entity.setHeadTokenPosition(headWordIndex);
entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
}
}
if (headWordIndex >= entity.getExtentTokenStart()
&& headWordIndex <= entity.getExtentTokenEnd()) {
entity.setHeadTokenPosition(headWordIndex);
entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
} else {
// Re-parse the argument words by themselves
// Get the list of words in the arg by looking at the leaves between
// arg.getExtentTokenStart() and arg.getExtentTokenEnd() inclusive
List<String> argWords = new ArrayList<>();
for (int i = entity.getExtentTokenStart(); i <= entity.getExtentTokenEnd(); i++) {
argWords.add(leaves.get(i).label().value());
}
if (StringUtils.isPunct(argWords.get(argWords.size() - 1))) {
argWords.remove(argWords.size() - 1);
}
Tree argTree = parseStrings(argWords);
headWordNode = argTree.headTerminal(headFinder);
      headWordIndex = getIndexByObjectEquality(argTree.getLeaves(), headWordNode)
          + entity.getExtentTokenStart();
entity.setHeadTokenPosition(headWordIndex);
entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
}
}
public static void main(String[] args) throws Exception {
// just a simple test, to make sure stuff works
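    // the command-line properties configure the StanfordCoreNLP pipeline used
    // for preprocessing (e.g. the standard "annotators" property); the corpus
    // provides its own tokenization, so choose annotators accordingly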
Properties props = StringUtils.argsToProperties(args);
RothCONLL04Reader reader = new RothCONLL04Reader();
reader.setLoggerLevel(Level.INFO);
reader.setProcessor(new StanfordCoreNLP(props));
Annotation doc = reader.parse("/u/nlp/data/RothCONLL04/conll04.corp");
System.out.println(AnnotationUtils.datasetToString(doc));
}
}