package edu.stanford.nlp.ie.machinereading.domains.roth;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.StringUtils;

/**
 * A reader designed for the relation extraction data studied in Dan Roth and Wen-tau Yih,
 * A Linear Programming Formulation for Global Inference in Natural Language Tasks. CoNLL 2004.
 * The format is a somewhat ad-hoc tab-separated-value file format.
 *
 * @author Mihai, David McClosky, and agusev
 * @author Sonal Gupta (sonalg@stanford.edu)
 */
public class RothCONLL04Reader extends GenericDataSetReader {

  public RothCONLL04Reader() {
    super(null, true, true, true);

    // change the logger to one from our namespace
    logger = Logger.getLogger(RothCONLL04Reader.class.getName());
    // run quietly by default
    logger.setLevel(Level.SEVERE);
  }
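  /*
   * For orientation, readSentence() below expects sentence blocks shaped roughly like
   * this (an illustrative sample reconstructed from the parsing logic, not copied from
   * the corpus; fields are separated by whitespace, and literal commas appear escaped
   * as "COMMA"):
   *
   *   3  Peop  0  O  NNP/NNP  Jamal/Ghosheh  O  O  O
   *   3  O     1  O  VBZ      lives          O  O  O
   *   3  O     2  O  IN       in             O  O  O
   *   3  Loc   3  O  NNP      Amman          O  O  O
   *
   *   0  3  Live_In
   *
   * Nine-field lines are tokens: sentence number, entity tag, token index, then the POS
   * tag(s) and word(s) in fields 4 and 5, with multi-word entities joined by '/'.
   * Three-field lines are relations: the token indices of the two argument entities
   * followed by the relation type. One blank line separates the tokens from the
   * relations; two consecutive blank lines end the sentence block.
   */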
  @Override
  public Annotation read(String path) throws IOException {
    Annotation doc = new Annotation("");

    logger.info("Reading file: " + path);

    // Each iteration through this loop processes a single sentence along with any relations in it
    for (Iterator<String> lineIterator = IOUtils.readLines(path).iterator(); lineIterator.hasNext(); ) {
      Annotation sentence = readSentence(path, lineIterator);
      AnnotationUtils.addSentence(doc, sentence);
    }

    return doc;
  }

  private boolean warnedNER; // = false;

  private String getNormalizedNERTag(String ner) {
    if (ner.equalsIgnoreCase("O")) {
      return "O";
    } else if (ner.equalsIgnoreCase("Peop")) {
      return "PERSON";
    } else if (ner.equalsIgnoreCase("Loc")) {
      return "LOCATION";
    } else if (ner.equalsIgnoreCase("Org")) {
      return "ORGANIZATION";
    } else if (ner.equalsIgnoreCase("Other")) {
      return "OTHER";
    } else {
      if ( ! warnedNER) {
        warnedNER = true;
        logger.warning("This file contains NER tags not in the original Roth/Yih dataset, e.g.: " + ner);
      }
      throw new RuntimeException("Cannot normalize ner tag " + ner);
    }
  }

  private Annotation readSentence(String docId, Iterator<String> lineIterator) {
    Annotation sentence = new Annotation("");
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>());

    // we'll need to set things like the tokens and textContent after we've
    // fully read the sentence

    // contains the full text that we've read so far
    StringBuilder textContent = new StringBuilder();
    int tokenCount = 0; // how many tokens we've seen so far
    List<CoreLabel> tokens = new ArrayList<>();

    // when we've seen two blank lines in a row, this sentence is over (one
    // blank line separates the sentence and the relations)
    int numBlankLinesSeen = 0;
    String sentenceID = null;

    // keeps track of entities we've seen so far for use by relations
    Map<String, EntityMention> indexToEntityMention = new HashMap<>();

    while (lineIterator.hasNext() && numBlankLinesSeen < 2) {
      String currentLine = lineIterator.next();
      currentLine = currentLine.replace("COMMA", ",");
      List<String> pieces = StringUtils.split(currentLine);
      String identifier;

      int size = pieces.size();
      switch (size) {
      case 1: // blank line between sentences or relations
        numBlankLinesSeen++;
        break;
      case 3: // relation
        String type = pieces.get(2);
        List<ExtractionObject> args = new ArrayList<>();
        EntityMention entity1 = indexToEntityMention.get(pieces.get(0));
        EntityMention entity2 = indexToEntityMention.get(pieces.get(1));
        args.add(entity1);
        args.add(entity2);
        // the relation extent runs from entity1's start to entity2's end
        // (this assumes entity1 precedes entity2 in the sentence)
        Span span = new Span(entity1.getExtentTokenStart(), entity2.getExtentTokenEnd());
        // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
        identifier = RelationMention.makeUniqueId();
        RelationMention relationMention = new RelationMention(identifier, sentence, span,
            type, null, args);
        AnnotationUtils.addRelationMention(sentence, relationMention);
        break;
      case 9: // token
        /*
         * Roth token lines look like this:
         *
         * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
         */

        // Entities may be multiple words joined by '/'; we split these up
        List<String> words = StringUtils.split(pieces.get(5), "/");
        //List<String> postags = StringUtils.split(pieces.get(4), "/");
        String text = StringUtils.join(words, " ");
        identifier = "entity" + pieces.get(0) + '-' + pieces.get(2);
        String nerTag = getNormalizedNERTag(pieces.get(1)); // entity type of the word/expression

        if (sentenceID == null)
          sentenceID = pieces.get(0);

        if (!nerTag.equals("O")) {
          Span extentSpan = new Span(tokenCount, tokenCount + words.size());
          // Temporarily sets the head span to equal the extent span.
          // This is so the entity has a head (in particular, getValue() works) even if
          // preprocessSentences isn't called. The head span is later modified if
          // preprocessSentences is called.
          EntityMention entity = new EntityMention(identifier, sentence, extentSpan,
              extentSpan, nerTag, null, null);
          AnnotationUtils.addEntityMention(sentence, entity);

          // we can get by using these indices as strings since we only use them
          // as a hash key
          String index = pieces.get(2);
          indexToEntityMention.put(index, entity);
        }

        // int i = 0;
        for (String word : words) {
          CoreLabel label = new CoreLabel();
          label.setWord(word);
          //label.setTag(postags.get(i));
          label.set(CoreAnnotations.TextAnnotation.class, word);
          label.set(CoreAnnotations.ValueAnnotation.class, word);
          // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
          // not keeping track of character offsets
          tokens.add(label);
          // i++;
        }

        textContent.append(text);
        textContent.append(' ');

        tokenCount += words.size();
        break;
      }
    }

    sentence.set(CoreAnnotations.TextAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.ValueAnnotation.class, textContent.toString());
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceID);

    return sentence;
  }

  /*
   * Gets the index of an object in a list using == to test (List.indexOf uses
   * equals() which could be problematic here)
   */
  private static <X> int getIndexByObjectEquality(List<X> list, X obj) {
    for (int i = 0, sz = list.size(); i < sz; i++) {
      if (list.get(i) == obj) {
        return i;
      }
    }
    return -1;
  }

  /*
   * Sets the head word and the index for an entity, given the parse tree for
   * the sentence containing the entity.
   *
   * This code is no longer used, but I've kept it around (at least for now) as
   * a reference when we modify preProcessSentences().
   */
  @SuppressWarnings("unused")
  private void setHeadWord(EntityMention entity, Tree tree) {
    List<Tree> leaves = tree.getLeaves();
    Tree argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()),
        leaves.get(entity.getExtentTokenEnd()));
    Tree headWordNode = argRoot.headTerminal(headFinder);

    int headWordIndex = getIndexByObjectEquality(leaves, headWordNode);

    if (StringUtils.isPunct(leaves.get(entity.getExtentTokenEnd()).label().value().trim())
        && (headWordIndex >= entity.getExtentTokenEnd() || headWordIndex < entity.getExtentTokenStart())) {
      argRoot = tree.joinNode(leaves.get(entity.getExtentTokenStart()),
          leaves.get(entity.getExtentTokenEnd() - 1));
      headWordNode = argRoot.headTerminal(headFinder);
      headWordIndex = getIndexByObjectEquality(leaves, headWordNode);
      if (headWordIndex >= entity.getExtentTokenStart()
          && headWordIndex <= entity.getExtentTokenEnd() - 1) {
        entity.setHeadTokenPosition(headWordIndex);
        entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
      }
    }

    if (headWordIndex >= entity.getExtentTokenStart()
        && headWordIndex <= entity.getExtentTokenEnd()) {
      entity.setHeadTokenPosition(headWordIndex);
      entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    } else {
      // Re-parse the argument words by themselves
      // Get the list of words in the arg by looking at the leaves between
      // arg.getExtentTokenStart() and arg.getExtentTokenEnd() inclusive
      List<String> argWords = new ArrayList<>();
      for (int i = entity.getExtentTokenStart(); i <= entity.getExtentTokenEnd(); i++) {
        argWords.add(leaves.get(i).label().value());
      }
      if (StringUtils.isPunct(argWords.get(argWords.size() - 1))) {
        argWords.remove(argWords.size() - 1);
      }
      Tree argTree = parseStrings(argWords);
      headWordNode = argTree.headTerminal(headFinder);
      headWordIndex = getIndexByObjectEquality(argTree.getLeaves(), headWordNode)
          + entity.getExtentTokenStart();
      entity.setHeadTokenPosition(headWordIndex);
      entity.setHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    }
  }
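  /*
   * Example invocation of the test driver below (a hypothetical command line; any
   * properties given as flags are passed straight through to StanfordCoreNLP, and
   * the corpus path is hardcoded to a Stanford-internal location):
   *
   *   java edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader \
   *       -annotators tokenize,ssplit,pos,lemma,parse
   */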
  public static void main(String[] args) throws Exception {
    // just a simple test, to make sure stuff works
    Properties props = StringUtils.argsToProperties(args);
    RothCONLL04Reader reader = new RothCONLL04Reader();
    reader.setLoggerLevel(Level.INFO);
    reader.setProcessor(new StanfordCoreNLP(props));
    Annotation doc = reader.parse("/u/nlp/data/RothCONLL04/conll04.corp");
    System.out.println(AnnotationUtils.datasetToString(doc));
  }

}