package edu.stanford.nlp.ie.machinereading.domains.ace.reader;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Logger;

import edu.stanford.nlp.ie.machinereading.common.SimpleTokenize;
import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader;
import edu.stanford.nlp.util.Generics;

/**
 * Stores the ACE elements annotated in this document.
 */
public class AceDocument extends AceElement {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AceDocument.class);

  /** Prefix of the files from which this doc was created */
  private String mPrefix;

  /** Value of the SOURCE XML field */
  private String mSource;

  /** All entities */
  private Map<String, AceEntity> mEntities;

  /** All entity mentions */
  private Map<String, AceEntityMention> mEntityMentions;

  /** All entity mentions in a given sentence, sorted in textual order */
  private ArrayList<ArrayList<AceEntityMention>> mSentenceEntityMentions;

  /** All relations */
  private Map<String, AceRelation> mRelations;

  /** All relation mentions */
  private Map<String, AceRelationMention> mRelationMentions;

  /** All relation mentions in a given sentence, sorted in textual order */
  private ArrayList<ArrayList<AceRelationMention>> mSentenceRelationMentions;

  /** All events */
  private Map<String, AceEvent> mEvents;

  /** All event mentions */
  private Map<String, AceEventMention> mEventMentions;

  /** All event mentions in a given sentence, sorted in textual order */
  private ArrayList<ArrayList<AceEventMention>> mSentenceEventMentions;

  /** The list of all tokens in the document, sorted in textual order */
  private Vector<AceToken> mTokens;

  /** List of all sentences in the document */
  private List<List<AceToken>> mSentences;

  /** The raw byte document, no preprocessing */
  private String mRawBuffer;

  static Logger mLog = Logger.getLogger(AceReader.class.getName());

  public AceDocument(String id) {
    super(id);
    mEntities = Generics.newHashMap();
    mEntityMentions = Generics.newHashMap();
    mSentenceEntityMentions = new ArrayList<>();
    mRelations = Generics.newHashMap();
    mRelationMentions = Generics.newHashMap();
    mSentenceRelationMentions = new ArrayList<>();
    mEvents = Generics.newHashMap();
    mEventMentions = Generics.newHashMap();
    mSentenceEventMentions = new ArrayList<>();
    mTokens = new Vector<>();
  }

  public void setPrefix(String p) {
    mPrefix = p;
    setSource(mPrefix);
  }

  public String getPrefix() {
    return mPrefix;
  }

  public void setSource(String p) {
    if (p.contains("bc/")) mSource = "broadcast conversation";
    else if (p.contains("bn/")) mSource = "broadcast news";
    else if (p.contains("cts/")) mSource = "telephone";
    else if (p.contains("nw/")) mSource = "newswire";
    else if (p.contains("un/")) mSource = "usenet";
    else if (p.contains("wl/")) mSource = "weblog";
    else {
      log.info("WARNING: Unknown source for doc: " + p);
      mSource = "none";
    }
  }

  public int getSentenceCount() {
    return mSentenceEntityMentions.size();
  }

  public ArrayList<AceEntityMention> getEntityMentions(int sent) {
    return mSentenceEntityMentions.get(sent);
  }

  public ArrayList<ArrayList<AceEntityMention>> getAllEntityMentions() {
    return mSentenceEntityMentions;
  }

  public ArrayList<AceRelationMention> getRelationMentions(int sent) {
    return mSentenceRelationMentions.get(sent);
  }

  public ArrayList<ArrayList<AceRelationMention>> getAllRelationMentions() {
    return mSentenceRelationMentions;
  }
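  // A minimal usage sketch of the per-sentence accessors above, assuming a
  // fully parsed document (see parseDocument below); the variable names are
  // illustrative, not part of this class:
  //
  //   AceDocument doc = ...;
  //   for (int s = 0; s < doc.getSentenceCount(); s++) {
  //     for (AceEntityMention em : doc.getEntityMentions(s)) {
  //       System.out.println(s + ": " + em.getHead().getText());
  //     }
  //   }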
  public ArrayList<AceEventMention> getEventMentions(int sent) {
    return mSentenceEventMentions.get(sent);
  }

  public ArrayList<ArrayList<AceEventMention>> getAllEventMentions() {
    return mSentenceEventMentions;
  }

  public AceEntity getEntity(String id) {
    return mEntities.get(id);
  }

  public Set<String> getKeySetEntities() {
    return mEntities.keySet();
  }

  public void addEntity(AceEntity e) {
    mEntities.put(e.getId(), e);
  }

  public Map<String, AceEntityMention> getEntityMentions() {
    return mEntityMentions;
  }

  public AceEntityMention getEntityMention(String id) {
    return mEntityMentions.get(id);
  }

  public void addEntityMention(AceEntityMention em) {
    mEntityMentions.put(em.getId(), em);
  }

  public AceRelation getRelation(String id) {
    return mRelations.get(id);
  }

  public void addRelation(AceRelation r) {
    mRelations.put(r.getId(), r);
  }

  public Map<String, AceRelationMention> getRelationMentions() {
    return mRelationMentions;
  }

  public AceRelationMention getRelationMention(String id) {
    return mRelationMentions.get(id);
  }

  public void addRelationMention(AceRelationMention e) {
    mRelationMentions.put(e.getId(), e);
  }

  public AceEvent getEvent(String id) {
    return mEvents.get(id);
  }

  public void addEvent(AceEvent r) {
    mEvents.put(r.getId(), r);
  }

  public Map<String, AceEventMention> getEventMentions() {
    return mEventMentions;
  }

  public AceEventMention getEventMention(String id) {
    return mEventMentions.get(id);
  }

  public void addEventMention(AceEventMention e) {
    mEventMentions.put(e.getId(), e);
  }

  public void addToken(AceToken t) {
    mTokens.add(t);
  }

  public int getTokenCount() {
    return mTokens.size();
  }

  public AceToken getToken(int i) {
    return mTokens.get(i);
  }

  public List<AceToken> getSentence(int index) {
    return mSentences.get(index);
  }

  public List<List<AceToken>> getSentences() {
    return mSentences;
  }

  public void setSentences(List<List<AceToken>> sentences) {
    mSentences = sentences;
  }

  @Override
  public String toString() {
    return toXml(0);
  }

  public String toXml(int offset) {
    StringBuffer buffer = new StringBuffer();
    appendOffset(buffer, offset);
    buffer.append("<?xml version=\"1.0\"?>\n");
    appendOffset(buffer, offset);
    buffer.append("<!DOCTYPE source_file SYSTEM \"apf.v5.1.2.dtd\">\n");
    appendOffset(buffer, offset);
    buffer.append("<source_file URI=\"" + mId + ".sgm\" SOURCE=\"" + mSource
        + "\" TYPE=\"text\" AUTHOR=\"LDC\" ENCODING=\"UTF-8\">\n");
    appendOffset(buffer, offset);
    buffer.append("<document DOCID=\"" + getId() + "\">\n");

    // display all entities
    Set<String> entKeys = mEntities.keySet();
    for (String key : entKeys) {
      AceEntity e = mEntities.get(key);
      buffer.append(e.toXml(offset));
      buffer.append("\n");
    }

    // display all relations
    Set<String> relKeys = mRelations.keySet();
    for (String key : relKeys) {
      AceRelation r = mRelations.get(key);
      if (!r.getType().equals(AceRelation.NIL_LABEL)) {
        buffer.append(r.toXml(offset));
        buffer.append("\n");
      }
    }

    // TODO: display all events

    appendOffset(buffer, offset);
    buffer.append("</document>\n");
    appendOffset(buffer, offset);
    buffer.append("</source_file>\n");

    return buffer.toString();
  }
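  // For reference, toXml(0) produces an APF-style shell like the following
  // (the document id "XIN20001115" is a made-up example):
  //
  //   <?xml version="1.0"?>
  //   <!DOCTYPE source_file SYSTEM "apf.v5.1.2.dtd">
  //   <source_file URI="XIN20001115.sgm" SOURCE="newswire" TYPE="text" AUTHOR="LDC" ENCODING="UTF-8">
  //   <document DOCID="XIN20001115">
  //   ... entity and relation XML ...
  //   </document>
  //   </source_file>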
  private String tokensWithByteSpan(int start, int end) {
    StringBuffer buf = new StringBuffer();
    boolean doPrint = false;
    buf.append("...");
    for (AceToken mToken : mTokens) {
      // start printing
      if (!doPrint && mToken.getByteOffset().start() > start - 20
          && mToken.getByteOffset().end() < end) {
        doPrint = true;
      }
      // end printing
      else if (doPrint && mToken.getByteOffset().start() > end + 20) {
        doPrint = false;
      }
      if (doPrint) {
        buf.append(" " + mToken.display());
      }
    }
    buf.append("...");
    return buf.toString();
  }

  /**
   * Matches all relevant mentions, i.e. entities and anchors, to tokens.
   * Note: entity mentions may match multiple tokens!
   */
  public void matchCharSeqs(String filePrefix) {
    //
    // match the head and extent of entity mentions
    //
    Set<String> keys = mEntityMentions.keySet();
    for (String key : keys) {
      AceEntityMention m = mEntityMentions.get(key);

      //
      // match the head charseq to 1+ phrase(s)
      //
      try {
        m.getHead().match(mTokens);
      } catch (MatchException e) {
        mLog.severe("READER ERROR: Failed to match entity mention head: " + "[" + m.getHead().getText() + ", "
            + m.getHead().getByteStart() + ", " + m.getHead().getByteEnd() + "]");
        mLog.severe("Document tokens: " + tokensWithByteSpan(m.getHead().getByteStart(), m.getHead().getByteEnd()));
        mLog.severe("Document prefix: " + filePrefix);
        System.exit(1);
      }

      //
      // match the extent charseq to 1+ phrase(s)
      //
      try {
        m.getExtent().match(mTokens);
      } catch (MatchException e) {
        mLog.severe("READER ERROR: Failed to match entity mention extent: " + "[" + m.getExtent().getText() + ", "
            + m.getExtent().getByteStart() + ", " + m.getExtent().getByteEnd() + "]");
        mLog.severe("Document tokens: " + tokensWithByteSpan(m.getExtent().getByteStart(), m.getExtent().getByteEnd()));
        System.exit(1);
      }

      //
      // set the head word of the mention
      //
      m.detectHeadToken(this);
    }

    // we need to do this for events as well, since they may not have any
    // AceEntityMentions associated with them (if they have no arguments)
    Set<String> eventKeys = mEventMentions.keySet();
    for (String key : eventKeys) {
      AceEventMention m = mEventMentions.get(key);

      //
      // match the extent charseq to 1+ phrase(s)
      //
      try {
        m.getExtent().match(mTokens);
      } catch (MatchException e) {
        mLog.severe("READER ERROR: Failed to match event mention extent: " + "[" + m.getExtent().getText() + ", "
            + m.getExtent().getByteStart() + ", " + m.getExtent().getByteEnd() + "]");
        mLog.severe("Document tokens: " + tokensWithByteSpan(m.getExtent().getByteStart(), m.getExtent().getByteEnd()));
        System.exit(1);
      }
    }
  }

  public static final String XML_EXT = ".apf.xml";
  public static final String ORIG_EXT = ".sgm";
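  // parseDocument below expects the prefix to name a pair of files sharing a
  // path, e.g. for prefix "/corpora/ace/nw/XIN20001115" (a made-up path):
  //
  //   /corpora/ace/nw/XIN20001115.apf.xml   the APF annotation file
  //   /corpora/ace/nw/XIN20001115.sgm       the raw SGML source
  //
  // A minimal call, assuming gold mention boundaries:
  //
  //   AceDocument doc = AceDocument.parseDocument("/corpora/ace/nw/XIN20001115", false);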
  /**
   * Parses an ACE document. Works in the following steps: (a) reads the XML
   * annotations; (b) reads the tokens; (c) matches the tokens against the
   * annotations; (d) constructs mSentenceEntityMentions and
   * mSentenceRelationMentions.
   */
  public static AceDocument parseDocument(String prefix, boolean usePredictedBoundaries)
      throws java.io.IOException, org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException {
    mLog.fine("Reading document " + prefix);
    AceDocument doc = null;

    //
    // read the ACE XML annotations
    //
    if (!usePredictedBoundaries) {
      doc = AceDomReader.parseDocument(new File(prefix + XML_EXT));
      // log.info("Parsed " + doc.getEntityMentions().size() +
      // " entities in document " + prefix);
    }
    //
    // will use the predicted entity boundaries (see below)
    //
    else {
      int lastSlash = prefix.lastIndexOf(File.separator);
      assert (lastSlash > 0 && lastSlash < prefix.length() - 1);
      String id = prefix.substring(lastSlash + 1);
      // log.info(id + ": " + prefix);
      doc = new AceDocument(id);
    }
    doc.setPrefix(prefix);

    //
    // read the raw byte stream
    //
    String trueCasedFileName = prefix + ORIG_EXT + ".truecase";
    if ((new File(trueCasedFileName).exists())) {
      mLog.severe("Using truecased file: " + trueCasedFileName);
      doc.readRawBytes(trueCasedFileName);
    } else {
      doc.readRawBytes(prefix + ORIG_EXT);
    }

    //
    // read the AceTokens
    //
    int offsetToSubtract = 0;
    List<List<AceToken>> sentences = AceSentenceSegmenter.tokenizeAndSegmentSentences(prefix);
    doc.setSentences(sentences);
    for (List<AceToken> sentence : sentences) {
      for (AceToken token : sentence) {
        offsetToSubtract = token.adjustPhrasePositions(offsetToSubtract, token.getLiteral());
        doc.addToken(token);
      }
    }

    //
    // match char sequences to phrases
    //
    doc.matchCharSeqs(prefix);

    //
    // construct the mEntityMentions matrix
    //
    Set<String> entityKeys = doc.mEntityMentions.keySet();
    int sentence;
    for (String key : entityKeys) {
      AceEntityMention em = doc.mEntityMentions.get(key);
      sentence = doc.mTokens.get(em.getHead().getTokenStart()).getSentence();

      // adjust the number of rows if necessary
      while (sentence >= doc.mSentenceEntityMentions.size()) {
        doc.mSentenceEntityMentions.add(new ArrayList<>());
        doc.mSentenceRelationMentions.add(new ArrayList<>());
        doc.mSentenceEventMentions.add(new ArrayList<>());
      }

      // store the entity mentions in increasing order:
      // (a) of the start position of their head
      // (b) if start is the same, in increasing order of the head end
      ArrayList<AceEntityMention> sentEnts = doc.mSentenceEntityMentions.get(sentence);
      boolean added = false;
      for (int i = 0; i < sentEnts.size(); i++) {
        AceEntityMention crt = sentEnts.get(i);
        if ((crt.getHead().getTokenStart() > em.getHead().getTokenStart())
            || (crt.getHead().getTokenStart() == em.getHead().getTokenStart()
                && crt.getHead().getTokenEnd() > em.getHead().getTokenEnd())) {
          sentEnts.add(i, em);
          added = true;
          break;
        }
      }
      if (!added) {
        sentEnts.add(em);
      }
    }
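    // To make the insertion sort above concrete: if a sentence contains head
    // spans [5,6], [2,3], and [2,4] (token offsets, made-up numbers), the
    // resulting row is ordered [2,3], [2,4], [5,6]: first by start position,
    // ties broken by end position.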
    //
    // construct the mRelationMentions matrix
    //
    Set<String> relKeys = doc.mRelationMentions.keySet();
    for (String key : relKeys) {
      AceRelationMention rm = doc.mRelationMentions.get(key);
      sentence = doc.mTokens.get(rm.getArg(0).getHead().getTokenStart()).getSentence();

      //
      // no need to adjust the number of rows: was done above
      //

      // store the relation mentions in increasing order
      // (a) of the start position of their head, or
      // (b) if start is the same, in increasing order of ends
      ArrayList<AceRelationMention> sentRels = doc.mSentenceRelationMentions.get(sentence);
      boolean added = false;
      for (int i = 0; i < sentRels.size(); i++) {
        AceRelationMention crt = sentRels.get(i);
        if ((crt.getMinTokenStart() > rm.getMinTokenStart())
            || (crt.getMinTokenStart() == rm.getMinTokenStart() && crt.getMaxTokenEnd() > rm.getMaxTokenEnd())) {
          sentRels.add(i, rm);
          added = true;
          break;
        }
      }
      if (!added) {
        sentRels.add(rm);
      }
    }

    //
    // construct the mEventMentions matrix
    //
    Set<String> eventKeys = doc.mEventMentions.keySet();
    for (String key : eventKeys) {
      AceEventMention em = doc.mEventMentions.get(key);
      sentence = doc.mTokens.get(em.getMinTokenStart()).getSentence();

      /*
       * adjust the number of rows if necessary -- if you're wondering why we do
       * this here again (after we've done it for entities), it's because we can
       * have an event with no entities near the end of the document and thus
       * won't have created rows in mSentence*Mentions
       */
      while (sentence >= doc.mSentenceEntityMentions.size()) {
        doc.mSentenceEntityMentions.add(new ArrayList<>());
        doc.mSentenceRelationMentions.add(new ArrayList<>());
        doc.mSentenceEventMentions.add(new ArrayList<>());
      }

      // store the event mentions in increasing order
      // (a) first, event mentions with no arguments
      // (b) then by the start position of their head, or
      // (c) if start is the same, in increasing order of ends
      ArrayList<AceEventMention> sentEvents = doc.mSentenceEventMentions.get(sentence);
      boolean added = false;
      for (int i = 0; i < sentEvents.size(); i++) {
        AceEventMention crt = sentEvents.get(i);
        if ((crt.getMinTokenStart() > em.getMinTokenStart())
            || (crt.getMinTokenStart() == em.getMinTokenStart() && crt.getMaxTokenEnd() > em.getMaxTokenEnd())) {
          sentEvents.add(i, em);
          added = true;
          break;
        }
      }
      if (!added) {
        sentEvents.add(em);
      }
    }

    return doc;
  }
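  // The overload below mirrors parseDocument(String, boolean) but stops after
  // constructing the entity-mention matrix, which is sufficient for
  // ACE2004-style corpora. A sketch of a call ("ACE2004" is an arbitrary
  // caller-chosen string; the parameter is not inspected):
  //
  //   AceDocument doc04 = AceDocument.parseDocument("/corpora/ace2004/nw/SOMEDOC", false, "ACE2004");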
  //
  // heeyoung : skip relation/event parsing part - for ACE2004
  //
  public static AceDocument parseDocument(String prefix, boolean usePredictedBoundaries, String aceVersion)
      throws java.io.IOException, org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException {
    // note: aceVersion is currently not inspected; this overload simply skips
    // the relation and event parsing steps
    mLog.fine("Reading document " + prefix);
    AceDocument doc = null;

    //
    // read the ACE XML annotations
    //
    if (!usePredictedBoundaries) {
      doc = AceDomReader.parseDocument(new File(prefix + XML_EXT));
      // log.info("Parsed " + doc.getEntityMentions().size() +
      // " entities in document " + prefix);
    }
    //
    // will use the predicted entity boundaries (see below)
    //
    else {
      int lastSlash = prefix.lastIndexOf(File.separator);
      assert (lastSlash > 0 && lastSlash < prefix.length() - 1);
      String id = prefix.substring(lastSlash + 1);
      // log.info(id + ": " + prefix);
      doc = new AceDocument(id);
    }
    doc.setPrefix(prefix);

    //
    // read the raw byte stream
    //
    String trueCasedFileName = prefix + ORIG_EXT + ".truecase";
    if ((new File(trueCasedFileName).exists())) {
      mLog.severe("Using truecased file: " + trueCasedFileName);
      doc.readRawBytes(trueCasedFileName);
    } else {
      doc.readRawBytes(prefix + ORIG_EXT);
    }

    //
    // read the AceTokens
    //
    int offsetToSubtract = 0;
    List<List<AceToken>> sentences = AceSentenceSegmenter.tokenizeAndSegmentSentences(prefix);
    doc.setSentences(sentences);
    for (List<AceToken> sentence : sentences) {
      for (AceToken token : sentence) {
        offsetToSubtract = token.adjustPhrasePositions(offsetToSubtract, token.getLiteral());
        doc.addToken(token);
      }
    }

    //
    // match char sequences to phrases
    //
    doc.matchCharSeqs(prefix);

    //
    // construct the mEntityMentions matrix
    //
    Set<String> entityKeys = doc.mEntityMentions.keySet();
    int sentence;
    for (String key : entityKeys) {
      AceEntityMention em = doc.mEntityMentions.get(key);
      sentence = doc.mTokens.get(em.getHead().getTokenStart()).getSentence();

      // adjust the number of rows if necessary
      while (sentence >= doc.mSentenceEntityMentions.size()) {
        doc.mSentenceEntityMentions.add(new ArrayList<>());
        doc.mSentenceRelationMentions.add(new ArrayList<>());
        doc.mSentenceEventMentions.add(new ArrayList<>());
      }

      // store the entity mentions in increasing order:
      // (a) of the start position of their head
      // (b) if start is the same, in increasing order of the head end
      ArrayList<AceEntityMention> sentEnts = doc.mSentenceEntityMentions.get(sentence);
      boolean added = false;
      for (int i = 0; i < sentEnts.size(); i++) {
        AceEntityMention crt = sentEnts.get(i);
        if ((crt.getHead().getTokenStart() > em.getHead().getTokenStart())
            || (crt.getHead().getTokenStart() == em.getHead().getTokenStart()
                && crt.getHead().getTokenEnd() > em.getHead().getTokenEnd())) {
          sentEnts.add(i, em);
          added = true;
          break;
        }
      }
      if (!added) {
        sentEnts.add(em);
      }
    }

    return doc;
  }

  // TODO: never used?
  public void constructSentenceRelationMentions() {
    //
    // construct the mSentenceRelationMentions matrix
    //
    Set<String> relKeys = mRelationMentions.keySet();
    for (String key : relKeys) {
      AceRelationMention rm = mRelationMentions.get(key);
      int sentence = mTokens.get(rm.getArg(0).getHead().getTokenStart()).getSentence();

      //
      // no need to adjust the number of rows: was done in parseDocument
      //

      // store the relation mentions in increasing order
      // (a) of the start position of their head, or
      // (b) if start is the same, in increasing order of ends
      ArrayList<AceRelationMention> sentRels = mSentenceRelationMentions.get(sentence);
      boolean added = false;
      for (int i = 0; i < sentRels.size(); i++) {
        AceRelationMention crt = sentRels.get(i);
        if ((crt.getMinTokenStart() > rm.getMinTokenStart())
            || (crt.getMinTokenStart() == rm.getMinTokenStart() && crt.getMaxTokenEnd() > rm.getMaxTokenEnd())) {
          sentRels.add(i, rm);
          added = true;
          break;
        }
      }
      if (!added) {
        sentRels.add(rm);
      }
    }
  }

  /**
   * Verifies that the two tokens are part of the same syntactic chunk
   */
  public boolean sameChunk(int left, int right) {
    for (int i = right; i > left; i--) {
      String chunk = AceToken.OTHERS.get(getToken(i).getChunk());
      if (!chunk.startsWith("I-")) return false;
      String word = AceToken.WORDS.get(getToken(i).getWord());
      if (word.equals(",") || word.equals("(") || word.equals("-")) return false;
    }
    String leftChunk = AceToken.OTHERS.get(getToken(left).getChunk());
    if (leftChunk.equals("O")) return false;
    return true;
  }

  public boolean isChunkHead(int pos) {
    String next = AceToken.OTHERS.get(getToken(pos + 1).getChunk());
    if (next.startsWith("I-")) return false;
    return true;
  }

  public int findChunkEnd(int pos) {
    String crt = AceToken.OTHERS.get(getToken(pos).getChunk());
    if (crt.equals("O")) return pos;
    for (pos = pos + 1; pos < getTokenCount(); pos++) {
      crt = AceToken.OTHERS.get(getToken(pos).getChunk());
      if (!crt.startsWith("I-")) break;
    }
    return pos - 1;
  }

  public int findChunkStart(int pos) {
    String crt = AceToken.OTHERS.get(getToken(pos).getChunk());
    if (crt.equals("O") || crt.startsWith("B-")) return pos;
    for (pos = pos - 1; pos >= 0; pos--) {
      crt = AceToken.OTHERS.get(getToken(pos).getChunk());
      if (crt.startsWith("B-")) break;
    }
    return pos;
  }

  public boolean isApposition(int left, int right) {
    int leftEnd = findChunkEnd(left);
    int rightStart = findChunkStart(right);
    if (rightStart == leftEnd + 1) return true;
    if (rightStart == leftEnd + 2) {
      String comma = AceToken.WORDS.get(getToken(leftEnd + 1).getWord());
      if (comma.equals(",") || comma.equals("-") || comma.equals("_")) {
        return true;
      }
    }
    return false;
  }
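  // A worked example for the chunk utilities above, using standard BIO chunk
  // labels on a made-up sentence:
  //
  //   tokens:  the  president  ,  Barack  Obama  ,  spoke
  //   chunks:  B-NP I-NP       O  B-NP    I-NP   O  B-VP
  //   index:   0    1          2  3       4      5  6
  //
  // findChunkEnd(0) == 1, findChunkStart(4) == 3, and isApposition(0, 4)
  // returns true because the right chunk starts exactly two positions after
  // the left chunk ends, with a comma in between.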
  public int countVerbs(int start, int end) {
    int count = 0;
    for (int i = start; i < end; i++) {
      String crt = AceToken.OTHERS.get(getToken(i).getPos());
      if (crt.startsWith("VB")) count++;
    }
    return count;
  }

  public int countCommas(int start, int end) {
    int count = 0;
    for (int i = start; i < end; i++) {
      String crt = AceToken.WORDS.get(getToken(i).getWord());
      if (crt.equals(",")) count++;
    }
    return count;
  }

  private void readRawBytes(String fileName) throws IOException {
    BufferedReader in = new BufferedReader(new FileReader(fileName));
    StringBuffer buf = new StringBuffer();
    int c;
    while ((c = in.read()) >= 0) buf.append((char) c);
    mRawBuffer = buf.toString();
    // System.out.println(mRawBuffer);
    in.close();
  }
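  // The method below consumes one label per token in a BIO-style format where
  // the label also encodes type and subtype after the "B-"/"I-" marker, e.g.
  // (made-up values) "B-PER-Individual", "I-PER-Individual", "O". Per the
  // parsing code, the type is the text between the first and second dash, and
  // the subtype is everything after the second dash.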
  @SuppressWarnings("unused")
  private void readPredictedEntityBoundaries(BufferedReader is) throws java.io.IOException {
    // System.out.println("Reading boundaries from file: " + mPrefix);

    //
    // read Massi's B-ENT, I-ENT, or O labels
    //
    ArrayList<String> labels = new ArrayList<>();
    String line;
    while ((line = is.readLine()) != null) {
      ArrayList<String> tokens = SimpleTokenize.tokenize(line);
      if (!tokens.isEmpty()) labels.add(tokens.get(0));
    }
    assert (labels.size() == mTokens.size());

    int entityId = 1;

    //
    // traverse the label array and create entities as needed
    //
    for (int i = 0; i < labels.size(); i++) {
      // System.out.println(labels.get(i));
      if (labels.get(i).startsWith("B-") || labels.get(i).startsWith("I-")) {
        // Massi's entities may start with I-ENT
        int startToken = i;
        int endToken = i + 1;
        while (endToken < labels.size() && labels.get(endToken).startsWith("I-"))
          endToken++;

        //
        // Set the type/subtype to whatever Massi predicted.
        // This is not directly used in this system. It is needed only
        // to generate the APF files with Massi info, which are needed
        // by Edgar. Otherwise type/subtype could be safely set to "none".
        //
        String label = labels.get(startToken);
        int dash = label.indexOf("-", 2);
        if (dash <= 2 || dash >= label.length()) {
          throw new RuntimeException(label);
        }
        assert (dash > 2 && dash < label.length() - 1);
        String type = label.substring(2, dash);
        String subtype = label.substring(dash + 1);
        /*
         * String type = "none"; String subtype = "none";
         */

        // create a new entity between [startToken, endToken)
        makeEntity(startToken, endToken, entityId, type, subtype);

        // skip over this entity
        i = endToken - 1;
        entityId++;
      } else {
        assert (labels.get(i).equals("O"));
      }
    }
  }

  public AceCharSeq makeCharSeq(int startToken, int endToken) {
    /*
     * StringBuffer buf = new StringBuffer(); for(int i = startToken; i <
     * endToken; i ++){ if(i > startToken) buf.append(" ");
     * buf.append(mTokens.get(i).getLiteral()); }
     */
    startToken = Math.max(0, startToken);
    while (mTokens.get(startToken).getByteStart() < 0) // SGML token
      startToken++;
    endToken = Math.min(endToken, mTokens.size());
    while (mTokens.get(endToken - 1).getByteStart() < 0) // SGML token
      endToken--;
    assert (endToken > startToken);

    String text = mRawBuffer.substring(mTokens.get(startToken).getRawByteStart(),
        mTokens.get(endToken - 1).getRawByteEnd());

    /*
     * if(mTokens.get(startToken).getByteStart() > mTokens.get(endToken -
     * 1).getByteEnd() - 1){ for(int i = startToken; i < endToken; i ++){
     * System.out.println("Token: " + mTokens.get(i).display()); } }
     */

    return new AceCharSeq(text, // buf.toString(),
        mTokens.get(startToken).getByteStart(),
        mTokens.get(endToken - 1).getByteEnd() - 1);
  }

  /** Makes an ACE entity from the span [startToken, endToken) */
  private void makeEntity(int startToken, int endToken, int id, String type, String subtype) {
    String eid = mId + "-E" + id;
    AceEntity ent = new AceEntity(eid, type, subtype, "SPC");
    addEntity(ent);

    AceCharSeq cseq = makeCharSeq(startToken, endToken);
    String emid = mId + "-E" + id + "-1";
    AceEntityMention entm = new AceEntityMention(emid, "NOM", "NOM", cseq, cseq);
    addEntityMention(entm);
    ent.addMention(entm);
  }

}