package edu.stanford.nlp.ie.machinereading.domains.ace;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceCharSeq;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMentionArgument;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken;
import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.EventMention;
import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;

/**
 * Simple wrapper that converts the output of Mihai's ACE code to
 * ie.machinereading.structure objects.
 *
 * @author David McClosky
 */
public class AceReader extends GenericDataSetReader {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(AceReader.class);

  private final Counter<String> entityCounts;
  private final Counter<String> adjacentEntityMentions;
  private final Counter<String> relationCounts;
  private final Counter<String> nameRelationCounts;
  private final Counter<String> eventCounts;
  private final Counter<String> mentionTypeCounts;
  private final String aceVersion;

  private static final boolean VERBOSE = false;

  /**
   * Make an AceReader.
   */
  public AceReader() {
    this(null, true);
  }

  public AceReader(StanfordCoreNLP processor, boolean preprocess) {
    this(processor, preprocess, "ACE2005");
  }

  public AceReader(StanfordCoreNLP processor, boolean preprocess, String version) {
    super(processor, preprocess, false, true);

    entityCounts = new ClassicCounter<>();
    adjacentEntityMentions = new ClassicCounter<>();
    nameRelationCounts = new ClassicCounter<>();
    relationCounts = new ClassicCounter<>();
    eventCounts = new ClassicCounter<>();
    mentionTypeCounts = new ClassicCounter<>();

    logger = Logger.getLogger(AceReader.class.getName());
    // run quietly by default
    logger.setLevel(Level.SEVERE);

    aceVersion = version;
  }
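  // A minimal usage sketch (the empty Properties and the path below are
  // illustrative, not from this codebase; see main() at the bottom of this
  // class for the actual test driver):
  //
  //   StanfordCoreNLP pipeline = new StanfordCoreNLP(new Properties(), false);
  //   AceReader reader = new AceReader(pipeline, false, "ACE2005");
  //   Annotation corpus = reader.parse("/path/to/ACE2005/english_test");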
  /**
   * Reads in ACE*.apf.xml files and converts them to RelationSentence objects.
   * Note that you probably should call parse() instead.
   *
   * Currently, this ignores document boundaries (the list returned will include
   * sentences from all documents).
   *
   * @param path directory containing ACE files to read (e.g.
   *          "/home/mcclosky/scr/data/ACE2005/english_test"). This can also be
   *          the path to a single file.
   * @return an Annotation containing the sentences from all documents read
   */
  @Override
  public Annotation read(String path) throws IOException, SAXException, ParserConfigurationException {
    List<CoreMap> allSentences = new ArrayList<>();
    File basePath = new File(path);
    assert basePath.exists();
    Annotation corpus = new Annotation("");

    if (basePath.isDirectory()) {
      for (File aceFile : IOUtils.iterFilesRecursive(basePath, ".apf.xml")) {
        if (aceFile.getName().endsWith(".UPC1.apf.xml")) {
          continue;
        }
        allSentences.addAll(readDocument(aceFile, corpus));
      }
    } else {
      // in case it's a file
      allSentences.addAll(readDocument(basePath, corpus));
    }

    AnnotationUtils.addSentences(corpus, allSentences);

    // quick stats
    if (VERBOSE) {
      printCounter(entityCounts, "entity mention");
      printCounter(relationCounts, "relation mention");
      printCounter(eventCounts, "event mention");
    }

    for (CoreMap sent : allSentences) {
      // check for entity mentions of the same type that are adjacent
      countAdjacentMentions(sent);
      // count relations between two proper nouns
      countNameRelations(sent);
      // count types of mentions
      countMentionTypes(sent);
    }
    if (VERBOSE) {
      printCounter(adjacentEntityMentions, "adjacent entity mention");
      printCounter(nameRelationCounts, "name relation mention");
      printCounter(mentionTypeCounts, "mention type counts");
    }

    return corpus;
  }

  private void countMentionTypes(CoreMap sent) {
    List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if (mentions != null) {
      for (EntityMention m : mentions) {
        mentionTypeCounts.incrementCount(m.getMentionType());
      }
    }
  }

  private void countNameRelations(CoreMap sent) {
    List<RelationMention> mentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if (mentions != null) {
      for (RelationMention m : mentions) {
        List<EntityMention> args = m.getEntityMentionArgs();
        if (args.size() == 2 &&
            args.get(0).getMentionType().equals("NAM") &&
            args.get(1).getMentionType().equals("NAM")) {
          nameRelationCounts.incrementCount(m.getType() + "." + m.getSubType());
        }
      }
    }
  }

  private void countAdjacentMentions(CoreMap sent) {
    List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if (mentions != null) {
      for (EntityMention m1 : mentions) {
        for (EntityMention m2 : mentions) {
          if (m1 == m2) continue;
          if (m1.getHeadTokenEnd() == m2.getHeadTokenStart() && m1.getType().equals(m2.getType())) {
            adjacentEntityMentions.incrementCount(m1.getType());
          }
        }
      }
    }
  }

  // todo: Change to use a counters print method (get sorting for free!)
  private void printCounter(Counter<String> c, String h) {
    StringBuilder b = new StringBuilder();
    b.append(h).append(" counts:\n");
    Set<String> keys = c.keySet();
    for (String k : keys) {
      b.append("\t").append(k).append(": ").append(c.getCount(k)).append("\n");
    }
    logger.info(b.toString());
  }
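  // Note on offsets: the ace.reader counts tokens from the beginning of the
  // document, while the structures built below count tokens from the beginning
  // of each sentence. readDocument(String, Annotation) threads a running
  // tokenOffset through the conversion methods to translate between the two.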
  /**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param file A file object of an ACE file
   * @return list of sentences (as CoreMaps) extracted from this file
   */
  private List<CoreMap> readDocument(File file, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
    // remove the extension to make it into a prefix
    String aceFilename = file.getAbsolutePath().replace(".apf.xml", "");
    List<CoreMap> sentencesFromFile = readDocument(aceFilename, corpus);
    return sentencesFromFile;
  }

  /**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param prefix prefix of ACE filename to read (e.g.
   *          "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
   *          (no ".apf.xml" extension)
   * @return list of sentences (as CoreMaps) extracted from this document
   */
  private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
    logger.info("Reading document: " + prefix);
    List<CoreMap> results = new ArrayList<>();
    AceDocument aceDocument;
    if (aceVersion.equals("ACE2004")) {
      aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
    } else {
      aceDocument = AceDocument.parseDocument(prefix, false);
    }
    String docId = aceDocument.getId();

    // map entity mention ID strings to their EntityMention counterparts
    Map<String, EntityMention> entityMentionMap = Generics.newHashMap();

    /*
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
      StringBuffer b = new StringBuffer();
      for (AceToken t : tokens) b.append(t.getLiteral() + " ");
      logger.info("SENTENCE: " + b.toString());
    }
    */

    int tokenOffset = 0;

    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
      List<CoreLabel> words = new ArrayList<>();
      StringBuilder textContent = new StringBuilder();
      for (int i = 0; i < tokens.size(); i++) {
        CoreLabel l = new CoreLabel();
        l.setWord(tokens.get(i).getLiteral());
        l.set(CoreAnnotations.ValueAnnotation.class, l.word());
        l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
        l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
        words.add(l);
        if (i > 0) textContent.append(" ");
        textContent.append(tokens.get(i).getLiteral());
      }

      // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
      if (words.size() == 1) {
        String word = words.get(0).word();
        if (word.startsWith("<") && word.endsWith(">")) {
          tokenOffset += tokens.size();
          continue;
        }
      }

      CoreMap sentence = new Annotation(textContent.toString());
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
      sentence.set(CoreAnnotations.TokensAnnotation.class, words);
      logger.info("Reading sentence: \"" + textContent + "\"");

      List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
      List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
      List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);

      // convert entity mentions
      for (AceEntityMention aceEntityMention : entityMentions) {
        String corefID = "";
        for (String entityID : aceDocument.getKeySetEntities()) {
          AceEntity e = aceDocument.getEntity(entityID);
          if (e.getMentions().contains(aceEntityMention)) {
            corefID = entityID;
            break;
          }
        }
        EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
        // EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
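        // Record stats and index the converted mention by its ACE id, so that
        // relation and event arguments can be resolved to it later.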
        entityCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
        logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
        AnnotationUtils.addEntityMention(sentence, convertedMention);
        entityMentionMap.put(aceEntityMention.getId(), convertedMention);

        // TODO: make Entity objects as needed
      }

      // convert relation mentions
      for (AceRelationMention aceRelationMention : relationMentions) {
        RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
        if (convertedMention != null) {
          relationCounts.incrementCount(convertedMention.getType());
          logger.info("CONVERTED RELATION MENTION: " + convertedMention);
          AnnotationUtils.addRelationMention(sentence, convertedMention);
        }

        // TODO: make Relation objects
      }

      // convert EventMentions
      for (AceEventMention aceEventMention : eventMentions) {
        EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
        if (convertedMention != null) {
          eventCounts.incrementCount(convertedMention.getType());
          logger.info("CONVERTED EVENT MENTION: " + convertedMention);
          AnnotationUtils.addEventMention(sentence, convertedMention);
        }

        // TODO: make Event objects
      }

      results.add(sentence);
      tokenOffset += tokens.size();
    }

    return results;
  }

  private EventMention convertAceEventMention(
      AceEventMention aceEventMention, String docId,
      CoreMap sentence, Map<String, EntityMention> entityMap,
      int tokenOffset) {
    Set<String> roleSet = aceEventMention.getRoles();
    List<String> roles = new ArrayList<>();
    for (String role : roleSet) roles.add(role);
    List<ExtractionObject> convertedArgs = new ArrayList<>();

    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (String role : roles) {
      AceEntityMention arg = aceEventMention.getArg(role);
      ExtractionObject o = entityMap.get(arg.getId());
      if (o == null) {
        logger.severe("READER ERROR: Failed to find event argument with id " + arg.getId());
        logger.severe("This happens because a few event mentions illegally span multiple sentences. Will ignore this mention.");
        return null;
      }
      convertedArgs.add(o);
      if (o.getExtentTokenStart() < left) left = o.getExtentTokenStart();
      if (o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd();
    }

    AceCharSeq anchor = aceEventMention.getAnchor();
    ExtractionObject anchorObject = new ExtractionObject(
        aceEventMention.getId() + "-anchor",
        sentence,
        new Span(anchor.getTokenStart() - tokenOffset, anchor.getTokenEnd() + 1 - tokenOffset),
        "ANCHOR",
        null);

    EventMention em = new EventMention(
        aceEventMention.getId(),
        sentence,
        new Span(left, right),
        aceEventMention.getParent().getType(),
        aceEventMention.getParent().getSubtype(),
        anchorObject,
        convertedArgs,
        roles);
    return em;
  }

  private RelationMention convertAceRelationMention(AceRelationMention aceRelationMention, String docId,
      CoreMap sentence, Map<String, EntityMention> entityMap) {
    List<AceRelationMentionArgument> args = Arrays.asList(aceRelationMention.getArgs());
    List<ExtractionObject> convertedArgs = new ArrayList<>();
    List<String> argNames = new ArrayList<>();

    // the arguments are already stored in semantic order. Make sure we preserve the same ordering!
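    // The loop below also computes the extent of the relation as the tightest
    // span covering both arguments. For example (hypothetical token indices):
    // if arg-1 spans tokens [3, 5) and arg-2 spans tokens [9, 12), the
    // resulting relation extent is [3, 12).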
    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (AceRelationMentionArgument arg : args) {
      ExtractionObject o = entityMap.get(arg.getContent().getId());
      if (o == null) {
        logger.severe("READER ERROR: Failed to find relation argument with id " + arg.getContent().getId());
        logger.severe("This happens because a few relation mentions illegally span multiple sentences. Will ignore this mention.");
        return null;
      }
      convertedArgs.add(o);
      argNames.add(arg.getRole());
      if (o.getExtentTokenStart() < left) left = o.getExtentTokenStart();
      if (o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd();
    }

    if (argNames.size() != 2 ||
        ! argNames.get(0).equalsIgnoreCase("arg-1") ||
        ! argNames.get(1).equalsIgnoreCase("arg-2")) {
      logger.severe("READER ERROR: Invalid succession of arguments in relation mention: " + argNames);
      logger.severe("ACE relations must have two arguments. Will ignore this mention.");
      return null;
    }

    RelationMention relation = new RelationMention(
        aceRelationMention.getId(),
        sentence,
        new Span(left, right),
        aceRelationMention.getParent().getType(),
        aceRelationMention.getParent().getSubtype(),
        convertedArgs,
        null);
    return relation;
  }

  /**
   * Convert an {@link AceEntityMention} to an {@link EntityMention}.
   *
   * @param entityMention {@link AceEntityMention} to convert
   * @param docId ID of the document containing this entity mention
   * @param sentence The sentence (as a CoreMap) that contains this mention
   * @param tokenOffset An offset used to position the extent relative to the sentence boundary
   *          (the ace.reader stores absolute token offsets from the beginning of the document, but
   *          we need token offsets from the beginning of the sentence => adjust by tokenOffset)
   * @return entity as an {@link EntityMention}
   */
  private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset) {
    //log.info("TYPE is " + entityMention.getParent().getType());
    //log.info("SUBTYPE is " + entityMention.getParent().getSubtype());
    //log.info("LDCTYPE is " + entityMention.getLdctype());

    AceCharSeq ext = entityMention.getExtent();
    AceCharSeq head = entityMention.getHead();

    int extStart = ext.getTokenStart() - tokenOffset;
    int extEnd = ext.getTokenEnd() - tokenOffset + 1;
    if (extStart < 0) {
      logger.severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
      extStart = 0;
    }
    if (extEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
      logger.severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
      extEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }
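    // A worked example of the head adjustment below (hypothetical numbers):
    // with tokenOffset = 10 and a head stored at document tokens 12..13
    // (inclusive), the sentence-level head span becomes [2, 4), since the end
    // is made exclusive by the +1.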
    int headStart = head.getTokenStart() - tokenOffset;
    int headEnd = head.getTokenEnd() - tokenOffset + 1;
    if (headStart < 0) {
      logger.severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
      headStart = 0;
    }
    if (headEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
      logger.severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
      headEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }

    // must adjust due to possible incorrect EOS detection
    if (headStart < extStart) {
      headStart = extStart;
    }
    if (headEnd > extEnd) {
      headEnd = extEnd;
    }
    assert(headStart < headEnd);

    // note: the ace.reader stores absolute token offsets from the beginning of the document, but
    // we need token offsets from the beginning of the sentence => adjust by tokenOffset
    // note: in ace.reader the end token position is inclusive, but
    // in our setup the end token position is exclusive => add 1 to end
    EntityMention converted = new EntityMention(
        entityMention.getId(),
        sentence,
        new Span(extStart, extEnd),
        new Span(headStart, headEnd),
        entityMention.getParent().getType(),
        entityMention.getParent().getSubtype(),
        entityMention.getLdctype());
    return converted;
  }

  private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset, String corefID) {
    EntityMention converted = convertAceEntityMention(entityMention, docId, sentence, tokenOffset);
    converted.setCorefID(corefID);
    return converted;
  }

  // simple testing code
  public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties(args);
    AceReader r = new AceReader(new StanfordCoreNLP(props, false), false);
    r.setLoggerLevel(Level.INFO);
    r.parse("/scr/nlp/data/ACE2005/");
    // Annotation a = r.parse("/user/mengqiu/scr/twitter/nlp/corpus_prep/standalone/ar/data");
    // BasicEntityExtractor.saveCoNLLFiles("/tmp/conll", a, false, false);
    log.info("done");
  }

}