package edu.stanford.nlp.ie.machinereading.domains.ace;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceCharSeq;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMentionArgument;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken;
import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.EventMention;
import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;

/**
 * Simple wrapper that converts the output of Mihai's ACE code to
 * ie.machinereading.structure objects.
 *
 * @author David McClosky
 */
public class AceReader extends GenericDataSetReader {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(AceReader.class);

  private final Counter<String> entityCounts;
  private final Counter<String> adjacentEntityMentions;
  private final Counter<String> relationCounts;
  private final Counter<String> nameRelationCounts;
  private final Counter<String> eventCounts;
  private final Counter<String> mentionTypeCounts;
  private final String aceVersion;

  private static final boolean VERBOSE = false;

  /**
   * Make an AceReader.
   */
  public AceReader() {
    this(null, true);
  }

  public AceReader(StanfordCoreNLP processor, boolean preprocess) {
    this(processor, preprocess, "ACE2005");
  }

  public AceReader(StanfordCoreNLP processor, boolean preprocess, String version) {
    super(processor, preprocess, false, true);

    entityCounts = new ClassicCounter<>();
    adjacentEntityMentions = new ClassicCounter<>();
    nameRelationCounts = new ClassicCounter<>();
    relationCounts = new ClassicCounter<>();
    eventCounts = new ClassicCounter<>();
    mentionTypeCounts = new ClassicCounter<>();

    logger = Logger.getLogger(AceReader.class.getName());
    // run quietly by default
    logger.setLevel(Level.SEVERE);

    aceVersion = version;
  }
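  // A minimal usage sketch (the empty Properties and the path below are
  // illustrative, not from this codebase; see main() at the bottom of this
  // class for the actual test driver):
  //
  //   StanfordCoreNLP pipeline = new StanfordCoreNLP(new Properties(), false);
  //   AceReader reader = new AceReader(pipeline, false, "ACE2005");
  //   Annotation corpus = reader.parse("/path/to/ACE2005/english_test");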
  /**
   * Reads in ACE*.apf.xml files and converts them to RelationSentence objects.
   * Note that you probably should call parse() instead.
   *
   * Currently, this ignores document boundaries (the list returned will include
   * sentences from all documents).
   *
   * @param path directory containing ACE files to read (e.g.
   *          "/home/mcclosky/scr/data/ACE2005/english_test"). This can also be
   *          the path to a single file.
   * @return an Annotation containing the sentences from all documents read
   */
  @Override
  public Annotation read(String path) throws IOException, SAXException, ParserConfigurationException {
    List<CoreMap> allSentences = new ArrayList<>();
    File basePath = new File(path);
    assert basePath.exists();
    Annotation corpus = new Annotation("");

    if (basePath.isDirectory()) {
      for (File aceFile : IOUtils.iterFilesRecursive(basePath, ".apf.xml")) {
        if (aceFile.getName().endsWith(".UPC1.apf.xml")) {
          continue;
        }
        allSentences.addAll(readDocument(aceFile, corpus));
      }
    } else {
      // in case it's a file
      allSentences.addAll(readDocument(basePath, corpus));
    }

    AnnotationUtils.addSentences(corpus, allSentences);

    // quick stats
    if (VERBOSE) {
      printCounter(entityCounts, "entity mention");
      printCounter(relationCounts, "relation mention");
      printCounter(eventCounts, "event mention");
    }

    for (CoreMap sent : allSentences) {
      // check for entity mentions of the same type that are adjacent
      countAdjacentMentions(sent);
      // count relations between two proper nouns
      countNameRelations(sent);
      // count types of mentions
      countMentionTypes(sent);
    }
    if (VERBOSE) {
      printCounter(adjacentEntityMentions, "adjacent entity mention");
      printCounter(nameRelationCounts, "name relation mention");
      printCounter(mentionTypeCounts, "mention type counts");
    }

    return corpus;
  }

  private void countMentionTypes(CoreMap sent) {
    List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if (mentions != null) {
      for (EntityMention m : mentions) {
        mentionTypeCounts.incrementCount(m.getMentionType());
      }
    }
  }

  private void countNameRelations(CoreMap sent) {
    List<RelationMention> mentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if (mentions != null) {
      for (RelationMention m : mentions) {
        List<EntityMention> args = m.getEntityMentionArgs();
        if (args.size() == 2 &&
            args.get(0).getMentionType().equals("NAM") &&
            args.get(1).getMentionType().equals("NAM")) {
          nameRelationCounts.incrementCount(m.getType() + "." + m.getSubType());
        }
      }
    }
  }

  private void countAdjacentMentions(CoreMap sent) {
    List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if (mentions != null) {
      for (EntityMention m1 : mentions) {
        for (EntityMention m2 : mentions) {
          if (m1 == m2) continue;
          if (m1.getHeadTokenEnd() == m2.getHeadTokenStart() && m1.getType().equals(m2.getType())) {
            adjacentEntityMentions.incrementCount(m1.getType());
          }
        }
      }
    }
  }

  // todo: Change to use a counters print method (get sorting for free!)
  private void printCounter(Counter<String> c, String h) {
    StringBuilder b = new StringBuilder();
    b.append(h).append(" counts:\n");
    Set<String> keys = c.keySet();
    for (String k : keys) {
      b.append("\t").append(k).append(": ").append(c.getCount(k)).append("\n");
    }
    logger.info(b.toString());
  }
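  // Note on offsets: the ace.reader counts tokens from the beginning of the
  // document, while the structures built below count tokens from the beginning
  // of each sentence. readDocument(String, Annotation) threads a running
  // tokenOffset through the conversion methods to translate between the two.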
  /**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param file A file object of an ACE file
   * @return list of sentences (as CoreMaps) extracted from this file
   */
  private List<CoreMap> readDocument(File file, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
    // remove the extension to make it into a prefix
    String aceFilename = file.getAbsolutePath().replace(".apf.xml", "");
    List<CoreMap> sentencesFromFile = readDocument(aceFilename, corpus);
    return sentencesFromFile;
  }

  /**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param prefix prefix of ACE filename to read (e.g.
   *          "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01")
   *          (no ".apf.xml" extension)
   * @return list of sentences (as CoreMaps) extracted from this document
   */
  private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException, ParserConfigurationException {
    logger.info("Reading document: " + prefix);
    List<CoreMap> results = new ArrayList<>();
    AceDocument aceDocument;
    if (aceVersion.equals("ACE2004")) {
      aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
    } else {
      aceDocument = AceDocument.parseDocument(prefix, false);
    }
    String docId = aceDocument.getId();

    // map entity mention ID strings to their EntityMention counterparts
    Map<String, EntityMention> entityMentionMap = Generics.newHashMap();

    /*
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
      StringBuffer b = new StringBuffer();
      for (AceToken t : tokens) b.append(t.getLiteral() + " ");
      logger.info("SENTENCE: " + b.toString());
    }
    */

    int tokenOffset = 0;

    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
      List<CoreLabel> words = new ArrayList<>();
      StringBuilder textContent = new StringBuilder();
      for (int i = 0; i < tokens.size(); i++) {
        CoreLabel l = new CoreLabel();
        l.setWord(tokens.get(i).getLiteral());
        l.set(CoreAnnotations.ValueAnnotation.class, l.word());
        l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
        l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
        words.add(l);
        if (i > 0) textContent.append(" ");
        textContent.append(tokens.get(i).getLiteral());
      }

      // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
      if (words.size() == 1) {
        String word = words.get(0).word();
        if (word.startsWith("<") && word.endsWith(">")) {
          tokenOffset += tokens.size();
          continue;
        }
      }

      CoreMap sentence = new Annotation(textContent.toString());
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
      sentence.set(CoreAnnotations.TokensAnnotation.class, words);
      logger.info("Reading sentence: \"" + textContent + "\"");

      List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
      List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
      List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);

      // convert entity mentions
      for (AceEntityMention aceEntityMention : entityMentions) {
        String corefID = "";
        for (String entityID : aceDocument.getKeySetEntities()) {
          AceEntity e = aceDocument.getEntity(entityID);
          if (e.getMentions().contains(aceEntityMention)) {
            corefID = entityID;
            break;
          }
        }
        EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
        // EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
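        // Record stats and index the converted mention by its ACE id, so that
        // relation and event arguments can be resolved to it later.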
        entityCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
        logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
        AnnotationUtils.addEntityMention(sentence, convertedMention);
        entityMentionMap.put(aceEntityMention.getId(), convertedMention);

        // TODO: make Entity objects as needed
      }

      // convert relation mentions
      for (AceRelationMention aceRelationMention : relationMentions) {
        RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
        if (convertedMention != null) {
          relationCounts.incrementCount(convertedMention.getType());
          logger.info("CONVERTED RELATION MENTION: " + convertedMention);
          AnnotationUtils.addRelationMention(sentence, convertedMention);
        }

        // TODO: make Relation objects
      }

      // convert EventMentions
      for (AceEventMention aceEventMention : eventMentions) {
        EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
        if (convertedMention != null) {
          eventCounts.incrementCount(convertedMention.getType());
          logger.info("CONVERTED EVENT MENTION: " + convertedMention);
          AnnotationUtils.addEventMention(sentence, convertedMention);
        }

        // TODO: make Event objects
      }

      results.add(sentence);
      tokenOffset += tokens.size();
    }

    return results;
  }

  private EventMention convertAceEventMention(
      AceEventMention aceEventMention, String docId,
      CoreMap sentence, Map<String, EntityMention> entityMap,
      int tokenOffset) {
    Set<String> roleSet = aceEventMention.getRoles();
    List<String> roles = new ArrayList<>();
    for (String role : roleSet) roles.add(role);
    List<ExtractionObject> convertedArgs = new ArrayList<>();

    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (String role : roles) {
      AceEntityMention arg = aceEventMention.getArg(role);
      ExtractionObject o = entityMap.get(arg.getId());
      if (o == null) {
        logger.severe("READER ERROR: Failed to find event argument with id " + arg.getId());
        logger.severe("This happens because a few event mentions illegally span multiple sentences. Will ignore this mention.");
        return null;
      }
      convertedArgs.add(o);
      if (o.getExtentTokenStart() < left) left = o.getExtentTokenStart();
      if (o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd();
    }

    AceCharSeq anchor = aceEventMention.getAnchor();
    ExtractionObject anchorObject = new ExtractionObject(
        aceEventMention.getId() + "-anchor",
        sentence,
        new Span(anchor.getTokenStart() - tokenOffset, anchor.getTokenEnd() + 1 - tokenOffset),
        "ANCHOR",
        null);

    EventMention em = new EventMention(
        aceEventMention.getId(),
        sentence,
        new Span(left, right),
        aceEventMention.getParent().getType(),
        aceEventMention.getParent().getSubtype(),
        anchorObject,
        convertedArgs,
        roles);
    return em;
  }

  private RelationMention convertAceRelationMention(AceRelationMention aceRelationMention, String docId,
      CoreMap sentence, Map<String, EntityMention> entityMap) {
    List<AceRelationMentionArgument> args = Arrays.asList(aceRelationMention.getArgs());
    List<ExtractionObject> convertedArgs = new ArrayList<>();
    List<String> argNames = new ArrayList<>();

    // the arguments are already stored in semantic order. Make sure we preserve the same ordering!
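    // The loop below also computes the extent of the relation as the tightest
    // span covering both arguments. For example (hypothetical token indices):
    // if arg-1 spans tokens [3, 5) and arg-2 spans tokens [9, 12), the
    // resulting relation extent is [3, 12).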
    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (AceRelationMentionArgument arg : args) {
      ExtractionObject o = entityMap.get(arg.getContent().getId());
      if (o == null) {
        logger.severe("READER ERROR: Failed to find relation argument with id " + arg.getContent().getId());
        logger.severe("This happens because a few relation mentions illegally span multiple sentences. Will ignore this mention.");
        return null;
      }
      convertedArgs.add(o);
      argNames.add(arg.getRole());
      if (o.getExtentTokenStart() < left) left = o.getExtentTokenStart();
      if (o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd();
    }

    if (argNames.size() != 2 ||
        ! argNames.get(0).equalsIgnoreCase("arg-1") ||
        ! argNames.get(1).equalsIgnoreCase("arg-2")) {
      logger.severe("READER ERROR: Invalid succession of arguments in relation mention: " + argNames);
      logger.severe("ACE relations must have two arguments. Will ignore this mention.");
      return null;
    }

    RelationMention relation = new RelationMention(
        aceRelationMention.getId(),
        sentence,
        new Span(left, right),
        aceRelationMention.getParent().getType(),
        aceRelationMention.getParent().getSubtype(),
        convertedArgs,
        null);
    return relation;
  }

  /**
   * Convert an {@link AceEntityMention} to an {@link EntityMention}.
   *
   * @param entityMention {@link AceEntityMention} to convert
   * @param docId ID of the document containing this entity mention
   * @param sentence The sentence (as a CoreMap) that contains this mention
   * @param tokenOffset An offset used to position the extent relative to the sentence boundary
   *          (the ace.reader stores absolute token offsets from the beginning of the document, but
   *          we need token offsets from the beginning of the sentence => adjust by tokenOffset)
   * @return entity as an {@link EntityMention}
   */
  private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset) {
    //log.info("TYPE is " + entityMention.getParent().getType());
    //log.info("SUBTYPE is " + entityMention.getParent().getSubtype());
    //log.info("LDCTYPE is " + entityMention.getLdctype());

    AceCharSeq ext = entityMention.getExtent();
    AceCharSeq head = entityMention.getHead();

    int extStart = ext.getTokenStart() - tokenOffset;
    int extEnd = ext.getTokenEnd() - tokenOffset + 1;
    if (extStart < 0) {
      logger.severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
      extStart = 0;
    }
    if (extEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
      logger.severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
      extEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }
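    // A worked example of the head adjustment below (hypothetical numbers):
    // with tokenOffset = 10 and a head stored at document tokens 12..13
    // (inclusive), the sentence-level head span becomes [2, 4), since the end
    // is made exclusive by the +1.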
    int headStart = head.getTokenStart() - tokenOffset;
    int headEnd = head.getTokenEnd() - tokenOffset + 1;
    if (headStart < 0) {
      logger.severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
      headStart = 0;
    }
    if (headEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
      logger.severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.getId() +
          " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
      headEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }

    // must adjust due to possible incorrect EOS detection
    if (headStart < extStart) {
      headStart = extStart;
    }
    if (headEnd > extEnd) {
      headEnd = extEnd;
    }
    assert(headStart < headEnd);

    // note: the ace.reader stores absolute token offsets from the beginning of the document, but
    // we need token offsets from the beginning of the sentence => adjust by tokenOffset
    // note: in ace.reader the end token position is inclusive, but
    // in our setup the end token position is exclusive => add 1 to end
    EntityMention converted = new EntityMention(
        entityMention.getId(),
        sentence,
        new Span(extStart, extEnd),
        new Span(headStart, headEnd),
        entityMention.getParent().getType(),
        entityMention.getParent().getSubtype(),
        entityMention.getLdctype());
    return converted;
  }

  private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset, String corefID) {
    EntityMention converted = convertAceEntityMention(entityMention, docId, sentence, tokenOffset);
    converted.setCorefID(corefID);
    return converted;
  }

  // simple testing code
  public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties(args);
    AceReader r = new AceReader(new StanfordCoreNLP(props, false), false);
    r.setLoggerLevel(Level.INFO);
    r.parse("/scr/nlp/data/ACE2005/");
    // Annotation a = r.parse("/user/mengqiu/scr/twitter/nlp/corpus_prep/standalone/ar/data");
    // BasicEntityExtractor.saveCoNLLFiles("/tmp/conll", a, false, false);
    log.info("done");
  }

}