package edu.jhu.agiga;
import static edu.jhu.agiga.AgigaSentenceReader.require;
import java.io.IOException;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.logging.Logger;
/**
* AgigaDocument provides access to the AgigaSentence and AgigaCoref objects,
* the document id, and the document type. This class also provides a method for
* writing out the coreference resolution annotations in a MUC style SGML output
* format.
*
* @author mgormley
*
*/
public class AgigaDocument implements Serializable {

    public static final long serialVersionUID = 1;

    private static final Logger log = Logger.getLogger(AgigaDocument.class.getName());

    private String docId;
    private String type;
    private String headline;
    private String dateline;
    private List<AgigaSentence> sents;
    private List<AgigaCoref> corefs;
    private AgigaPrefs prefs;

    /**
     * Creates an empty document.
     *
     * The sentence list always starts out empty. The coref list is only
     * allocated when {@code prefs.readCoref} is set; otherwise it stays
     * {@code null} until {@link #setCorefs(List)} is called.
     *
     * @param prefs the reader preferences this document was created under
     */
    public AgigaDocument(AgigaPrefs prefs) {
        this.prefs = prefs;
        sents = new ArrayList<AgigaSentence>();
        if (prefs.readCoref) {
            corefs = new ArrayList<AgigaCoref>();
        }
    }

    /** Appends a sentence to the end of this document. */
    public void add(AgigaSentence agigaSent) {
        sents.add(agigaSent);
    }

    /** Replaces the coref list of this document (the list is stored, not copied). */
    public void setCorefs(List<AgigaCoref> corefs) {
        this.corefs = corefs;
    }

    /** Returns the live list of sentences backing this document. */
    public List<AgigaSentence> getSents() {
        return sents;
    }

    /** Replaces the sentence list of this document (the list is stored, not copied). */
    public void setSents(List<AgigaSentence> sents) {
        this.sents = sents;
    }

    /**
     * Returns the live list of coref chains, or {@code null} if the document
     * was built with {@code prefs.readCoref == false} and none were set.
     */
    public List<AgigaCoref> getCorefs() {
        return corefs;
    }

    /** Returns the preferences passed to the constructor. */
    public AgigaPrefs getPrefs() {
        return prefs;
    }

    /** Returns the document id, or {@code null} if it has not been set. */
    public String getDocId() {
        return docId;
    }

    public void setDocId(String id) {
        this.docId = id;
    }

    /** Returns the document type, or {@code null} if it has not been set. */
    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /** Returns the headline, or {@code null} if it has not been set. */
    public String getHeadline() {
        return headline;
    }

    public void setHeadline(String headline) {
        this.headline = headline;
    }

    /** Returns the dateline, or {@code null} if it has not been set. */
    public String getDateline() {
        return dateline;
    }

    public void setDateline(String dateline) {
        this.dateline = dateline;
    }

    // TODO: should this move to an external class? it might be tricky to do so
    /**
     * Writes the document text with inline coreference markup.
     *
     * Each sentence is written on its own line with tokens separated by a
     * single space, and one extra newline is written after the last sentence.
     * A mention is opened with {@code <COREF ID=n>} (representative mention) or
     * {@code <COREF ID=n REF=m>} (any other mention) immediately before its
     * start token and closed with {@code </COREF>} at its end token index,
     * which is treated as exclusive.
     *
     * As a side effect this method (re)assigns the MUC id and ref of every
     * mention in the document before writing.
     *
     * @param writer destination for the markup; it is not closed by this method
     * @throws IOException if writing to {@code writer} fails
     * @throws RuntimeException if a pending mention's end position is found to
     *             be behind the current write position; presumably
     *             {@code require} also throws when a precondition fails (its
     *             implementation is not visible here)
     */
    public void writeMucStyleCoref(Writer writer) throws IOException {
        require(prefs.readWord && prefs.readCoref,
                "AgigaPrefs.{readWord,readCoref} must be true for writeMucStyleCoref()");

        assignMucStyleIdsAndRefsToMentions();

        // Mentions still to be opened, in the order their opening tags must appear.
        AgigaMention[] mentionArray = getAllMentions().toArray(new AgigaMention[0]);
        Arrays.sort(mentionArray, new StartMentionComparator());
        LinkedList<AgigaMention> mentionStarts = new LinkedList<AgigaMention>(Arrays.asList(mentionArray));
        log.finer("Total number of mentions: " + mentionStarts.size());

        // Mentions currently open, ordered so the one that closes first is at the head.
        PriorityQueue<AgigaMention> mentionEnds = new PriorityQueue<AgigaMention>(11, new EndMentionComparator());

        log.finer("Number of sentences: " + sents.size());
        for (int s = 0; s < sents.size(); s++) {
            AgigaSentence sent = sents.get(s);
            List<AgigaToken> tokens = sent.getTokens();
            log.finer("Number of tokens: " + tokens.size());

            // Run one position past the last token so that mentions ending at
            // the end of the sentence still get their closing tag.
            for (int i = 0; i <= tokens.size(); i++) {
                // Close every mention that ends at this position.
                while (!mentionEnds.isEmpty() && mentionEnds.peek().getSentenceIdx() == s
                        && mentionEnds.peek().getEndTokenIdx() == i) {
                    mentionEnds.remove();
                    writer.write("</COREF>");
                }

                // Token separator goes after closing tags and before opening tags.
                if (i > 0 && i < tokens.size()) {
                    writer.write(" ");
                }

                // Sanity check: the earliest-ending open mention must not
                // already lie behind the current position.
                if (!mentionEnds.isEmpty()) {
                    AgigaMention earliest = mentionEnds.peek();
                    boolean inEarlierSentence = earliest.getSentenceIdx() < s;
                    boolean earlierInThisSentence = earliest.getSentenceIdx() == s
                            && earliest.getEndTokenIdx() < i;
                    if (inEarlierSentence || earlierInThisSentence) {
                        // Flush what has been written so far to help debugging.
                        writer.flush();
                        log.severe("mentionEnds: " + mentionEnds);
                        throw new RuntimeException(String.format("Overlapping coref elements. s=%d i=%d", s, i));
                    }
                }

                // Open every mention that starts at this position.
                while (!mentionStarts.isEmpty() && mentionStarts.peek().getSentenceIdx() == s
                        && mentionStarts.peek().getStartTokenIdx() == i) {
                    AgigaMention head = mentionStarts.pop();
                    if (head.isRepresentative()) {
                        writer.write(String.format("<COREF ID=%d>", head.getMucId()));
                    } else {
                        writer.write(String.format("<COREF ID=%d REF=%d>", head.getMucId(), head.getMucRef()));
                    }
                    mentionEnds.add(head);
                }

                if (i >= tokens.size()) {
                    break;
                }
                AgigaToken tok = tokens.get(i);
                writer.write(tok.getWord());
            }

            // Every mention opened in this sentence must have been closed in it.
            require(mentionEnds.size() == 0);
            writer.write("\n");
        }
        writer.write("\n");
    }

    /**
     * Two documents are equal when their id, type, headline, dateline,
     * sentences and corefs are equal. The preferences are deliberately not
     * part of the comparison.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof AgigaDocument)) {
            // Also covers other == null.
            return false;
        }
        AgigaDocument o = (AgigaDocument) other;
        return Util.safeEquals(docId, o.docId)
                && Util.safeEquals(type, o.type)
                && Util.safeEquals(headline, o.headline)
                && Util.safeEquals(dateline, o.dateline)
                && Util.safeEquals(sents, o.sents)
                && Util.safeEquals(corefs, o.corefs);
    }

    /**
     * Hashes exactly the fields compared by {@link #equals(Object)}.
     *
     * The preferences must not be mixed in: equals() ignores them, so
     * including them would let two equal documents produce different hash
     * codes and break hash-based collections.
     */
    @Override
    public int hashCode() {
        return Util.safeHashCode(docId, type, headline, dateline, sents, corefs);
    }

    /** Three-way int comparison that cannot overflow, unlike {@code a - b}. */
    private static int compareInts(int a, int b) {
        return (a < b) ? -1 : ((a == b) ? 0 : 1);
    }

    /**
     * Orders mentions by the position at which their opening tag is written:
     * sentence index, then start token index, then descending end token index.
     */
    private static class StartMentionComparator implements Comparator<AgigaMention> {
        @Override
        public int compare(AgigaMention m1, AgigaMention m2) {
            int val = compareInts(m1.getSentenceIdx(), m2.getSentenceIdx());
            if (val != 0) {
                return val;
            }
            val = compareInts(m1.getStartTokenIdx(), m2.getStartTokenIdx());
            if (val != 0) {
                return val;
            }
            // For overlapping mentions starting at the same token, we want the
            // later-ending one to start on the left, hence the swapped operands.
            return compareInts(m2.getEndTokenIdx(), m1.getEndTokenIdx());
        }
    }

    /**
     * Orders mentions by the position at which their closing tag is written:
     * sentence index, then end token index.
     */
    private static class EndMentionComparator implements Comparator<AgigaMention> {
        @Override
        public int compare(AgigaMention m1, AgigaMention m2) {
            int val = compareInts(m1.getSentenceIdx(), m2.getSentenceIdx());
            if (val != 0) {
                return val;
            }
            return compareInts(m1.getEndTokenIdx(), m2.getEndTokenIdx());
        }
    }

    /** Collects the mentions of every coref chain into one new list, in chain order. */
    private List<AgigaMention> getAllMentions() {
        List<AgigaMention> allMentions = new ArrayList<AgigaMention>();
        for (AgigaCoref coref : corefs) {
            allMentions.addAll(coref.getMentions());
        }
        return allMentions;
    }

    /**
     * Numbers every mention in the document and links each non-representative
     * mention to the representative of its chain.
     *
     * Ids are consecutive, starting at 0, in chain order and then mention
     * order. Representative mentions get {@code AgigaMention.UNASSIGNED} as
     * their ref. If a chain has no representative mention, its mentions get a
     * ref of -1.
     */
    private void assignMucStyleIdsAndRefsToMentions() {
        // Create IDs and REFs as in MUC-7
        int id = 0;
        for (AgigaCoref coref : corefs) {
            // First pass: hand out ids and remember the representative's id.
            int representativeId = -1;
            for (AgigaMention mention : coref.getMentions()) {
                mention.setMucId(id++);
                if (mention.isRepresentative()) {
                    representativeId = mention.getMucId();
                }
            }
            // Second pass: the representative id is now known for the whole chain.
            for (AgigaMention mention : coref.getMentions()) {
                if (!mention.isRepresentative()) {
                    mention.setMucRef(representativeId);
                } else {
                    mention.setMucRef(AgigaMention.UNASSIGNED);
                }
            }
        }
    }
}