package edu.stanford.nlp.ie.machinereading.domains.ace.reader;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Logger;
import edu.stanford.nlp.ie.machinereading.common.SimpleTokenize;
import edu.stanford.nlp.ie.machinereading.domains.ace.AceReader;
import edu.stanford.nlp.util.Generics;
/**
 * Stores the ACE elements annotated in this document: entities, relations and
 * events, their mentions (both indexed by id and grouped per sentence), the
 * document tokens and the raw text the tokens were read from.
 *
 * <p>Instances are normally created through one of the static
 * {@code parseDocument} methods, which also fill in the per-sentence mention
 * tables.
 */
public class AceDocument extends AceElement {
  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AceDocument.class);

  /** java.util.logging logger; note that it is registered under the AceReader class name */
  static Logger mLog = Logger.getLogger(AceReader.class.getName());

  /** Extension of the file holding the XML annotations */
  public static final String XML_EXT = ".apf.xml";

  /** Extension of the file holding the original text */
  public static final String ORIG_EXT = ".sgm";

  /** Prefix of the files from where this doc was created */
  private String mPrefix;

  /** Value of the SOURCE XML field */
  private String mSource;

  /** All entities, indexed by id */
  private final Map<String, AceEntity> mEntities;

  /** All entity mentions, indexed by id */
  private final Map<String, AceEntityMention> mEntityMentions;

  /** All entity mentions in a given sentence, sorted in textual order */
  private final ArrayList<ArrayList<AceEntityMention>> mSentenceEntityMentions;

  /** All relations, indexed by id */
  private final Map<String, AceRelation> mRelations;

  /** All relation mentions, indexed by id */
  private final Map<String, AceRelationMention> mRelationMentions;

  /** All relation mentions in a given sentence, sorted in textual order */
  private final ArrayList<ArrayList<AceRelationMention>> mSentenceRelationMentions;

  /** All events, indexed by id */
  private final Map<String, AceEvent> mEvents;

  /** All event mentions, indexed by id */
  private final Map<String, AceEventMention> mEventMentions;

  /** All event mentions in a given sentence, sorted in textual order */
  private final ArrayList<ArrayList<AceEventMention>> mSentenceEventMentions;

  /** The list of all tokens in the document, sorted in textual order */
  private final Vector<AceToken> mTokens;

  /** List of all sentences in the document */
  private List<List<AceToken>> mSentences;

  /** The raw document text, no preprocessing */
  private String mRawBuffer;

  /**
   * Creates an empty document with the given id.
   *
   * @param id the document id, passed on to the superclass
   */
  public AceDocument(String id) {
    super(id);

    mEntities = Generics.newHashMap();
    mEntityMentions = Generics.newHashMap();
    mSentenceEntityMentions = new ArrayList<>();

    mRelations = Generics.newHashMap();
    mRelationMentions = Generics.newHashMap();
    mSentenceRelationMentions = new ArrayList<>();

    mEvents = Generics.newHashMap();
    mEventMentions = Generics.newHashMap();
    mSentenceEventMentions = new ArrayList<>();

    mTokens = new Vector<>();
  }

  /**
   * Stores the file prefix and derives the document source from it.
   *
   * @param p prefix of the files this document is read from; must not be null
   */
  public void setPrefix(String p) {
    mPrefix = p;
    setSource(mPrefix);
  }

  /** Returns the prefix of the files from where this doc was created. */
  public String getPrefix() {
    return mPrefix;
  }

  /**
   * Sets the SOURCE field from the genre directory that appears in the given
   * path ("bc/", "bn/", "cts/", "nw/", "un/" or "wl/", checked in that order).
   * When none is present a warning is logged and the source is set to "none".
   *
   * @param p path to inspect; must not be null
   */
  public void setSource(String p) {
    if (p.contains("bc/")) {
      mSource = "broadcast conversation";
    } else if (p.contains("bn/")) {
      mSource = "broadcast news";
    } else if (p.contains("cts/")) {
      mSource = "telephone";
    } else if (p.contains("nw/")) {
      mSource = "newswire";
    } else if (p.contains("un/")) {
      mSource = "usenet";
    } else if (p.contains("wl/")) {
      mSource = "weblog";
    } else {
      log.info("WARNING: Unknown source for doc: " + p);
      mSource = "none";
    }
  }

  /**
   * Returns the number of rows in the per-sentence mention tables. Rows are
   * only created up to the last sentence that holds a mention, so this may be
   * smaller than the size of {@link #getSentences()}.
   */
  public int getSentenceCount() {
    return mSentenceEntityMentions.size();
  }

  /** Returns the entity mentions of the given sentence, in textual order. */
  public ArrayList<AceEntityMention> getEntityMentions(int sent) {
    return mSentenceEntityMentions.get(sent);
  }

  /** Returns the per-sentence entity mention table (the internal list, not a copy). */
  public ArrayList<ArrayList<AceEntityMention>> getAllEntityMentions() {
    return mSentenceEntityMentions;
  }

  /** Returns the relation mentions of the given sentence, in textual order. */
  public ArrayList<AceRelationMention> getRelationMentions(int sent) {
    return mSentenceRelationMentions.get(sent);
  }

  /** Returns the per-sentence relation mention table (the internal list, not a copy). */
  public ArrayList<ArrayList<AceRelationMention>> getAllRelationMentions() {
    return mSentenceRelationMentions;
  }

  /** Returns the event mentions of the given sentence, in textual order. */
  public ArrayList<AceEventMention> getEventMentions(int sent) {
    return mSentenceEventMentions.get(sent);
  }

  /** Returns the per-sentence event mention table (the internal list, not a copy). */
  public ArrayList<ArrayList<AceEventMention>> getAllEventMentions() {
    return mSentenceEventMentions;
  }

  /** Returns the entity with the given id, or null if there is none. */
  public AceEntity getEntity(String id) {
    return mEntities.get(id);
  }

  /** Returns the ids of all entities (a view of the internal map's key set). */
  public Set<String> getKeySetEntities() {
    return mEntities.keySet();
  }

  /** Registers an entity under its own id, replacing any entity with the same id. */
  public void addEntity(AceEntity e) {
    mEntities.put(e.getId(), e);
  }

  /** Returns the id-indexed map of entity mentions (the internal map, not a copy). */
  public Map<String, AceEntityMention> getEntityMentions() {
    return mEntityMentions;
  }

  /** Returns the entity mention with the given id, or null if there is none. */
  public AceEntityMention getEntityMention(String id) {
    return mEntityMentions.get(id);
  }

  /** Registers an entity mention under its own id. */
  public void addEntityMention(AceEntityMention em) {
    mEntityMentions.put(em.getId(), em);
  }

  /** Returns the relation with the given id, or null if there is none. */
  public AceRelation getRelation(String id) {
    return mRelations.get(id);
  }

  /** Registers a relation under its own id. */
  public void addRelation(AceRelation r) {
    mRelations.put(r.getId(), r);
  }

  /** Returns the id-indexed map of relation mentions (the internal map, not a copy). */
  public Map<String, AceRelationMention> getRelationMentions() {
    return mRelationMentions;
  }

  /** Returns the relation mention with the given id, or null if there is none. */
  public AceRelationMention getRelationMention(String id) {
    return mRelationMentions.get(id);
  }

  /** Registers a relation mention under its own id. */
  public void addRelationMention(AceRelationMention e) {
    mRelationMentions.put(e.getId(), e);
  }

  /** Returns the event with the given id, or null if there is none. */
  public AceEvent getEvent(String id) {
    return mEvents.get(id);
  }

  /** Registers an event under its own id. */
  public void addEvent(AceEvent r) {
    mEvents.put(r.getId(), r);
  }

  /** Returns the id-indexed map of event mentions (the internal map, not a copy). */
  public Map<String, AceEventMention> getEventMentions() {
    return mEventMentions;
  }

  /** Returns the event mention with the given id, or null if there is none. */
  public AceEventMention getEventMention(String id) {
    return mEventMentions.get(id);
  }

  /** Registers an event mention under its own id. */
  public void addEventMention(AceEventMention e) {
    mEventMentions.put(e.getId(), e);
  }

  /** Appends a token to the end of the document token list. */
  public void addToken(AceToken t) {
    mTokens.add(t);
  }

  /** Returns the number of tokens in the document. */
  public int getTokenCount() {
    return mTokens.size();
  }

  /** Returns the token at the given position in the document token list. */
  public AceToken getToken(int i) {
    return mTokens.get(i);
  }

  /** Returns the tokens of the sentence at the given index. */
  public List<AceToken> getSentence(int index) {
    return mSentences.get(index);
  }

  /** Returns all sentences, or null if they have not been set yet. */
  public List<List<AceToken>> getSentences() {
    return mSentences;
  }

  /** Sets the list of sentences; the list is stored as is, not copied. */
  public void setSentences(List<List<AceToken>> sentences) {
    mSentences = sentences;
  }

  /** Returns the XML rendering of this document with no indentation offset. */
  @Override
  public String toString() {
    return toXml(0);
  }

  /**
   * Renders the document header, all entities and all relations whose type is
   * not {@code AceRelation.NIL_LABEL} as XML. Events are not written.
   *
   * @param offset indentation offset handed to {@code appendOffset} and to the
   *        elements' own {@code toXml}
   * @return the XML text
   */
  public String toXml(int offset) {
    // StringBuffer (not StringBuilder) because appendOffset is called with it
    StringBuffer buffer = new StringBuffer();
    appendOffset(buffer, offset);
    buffer.append("<?xml version=\"1.0\"?>\n");
    appendOffset(buffer, offset);
    buffer.append("<!DOCTYPE source_file SYSTEM \"apf.v5.1.2.dtd\">\n");
    appendOffset(buffer, offset);
    buffer.append("<source_file URI=\"" + mId + ".sgm\" SOURCE=\"" + mSource
        + "\" TYPE=\"text\" AUTHOR=\"LDC\" ENCODING=\"UTF-8\">\n");
    appendOffset(buffer, offset);
    buffer.append("<document DOCID=\"" + getId() + "\">\n");

    // display all entities
    for (AceEntity e : mEntities.values()) {
      buffer.append(e.toXml(offset));
      buffer.append("\n");
    }

    // display all relations, skipping the NIL ones
    for (AceRelation r : mRelations.values()) {
      if (!r.getType().equals(AceRelation.NIL_LABEL)) {
        buffer.append(r.toXml(offset));
        buffer.append("\n");
      }
    }

    // TODO: display all events

    appendOffset(buffer, offset);
    buffer.append("</document>\n");
    appendOffset(buffer, offset);
    buffer.append("</source_file>\n");
    return buffer.toString();
  }

  /**
   * Builds a debugging string with the tokens around the byte span
   * [start, end], padded by roughly 20 bytes on each side.
   */
  private String tokensWithByteSpan(int start, int end) {
    StringBuilder buf = new StringBuilder();
    boolean doPrint = false;
    buf.append("...");
    for (AceToken token : mTokens) {
      if (!doPrint && token.getByteOffset().start() > start - 20
          && token.getByteOffset().end() < end) {
        // start printing
        doPrint = true;
      } else if (doPrint && token.getByteOffset().start() > end + 20) {
        // end printing
        doPrint = false;
      }
      if (doPrint) {
        buf.append(" " + token.display());
      }
    }
    buf.append("...");
    return buf.toString();
  }

  /**
   * Matches all relevant mentions, i.e. entities and anchors, to tokens. Note:
   * entity mentions may match with multiple tokens!
   *
   * <p>For every entity mention the head and the extent are matched and the
   * head token is detected; for every event mention the extent is matched.
   * A failed match is logged and terminates the JVM with exit status 1.
   *
   * @param filePrefix prefix of the document files, used in error messages only
   */
  public void matchCharSeqs(String filePrefix) {
    //
    // match the head and extent of entity mentions
    //
    for (AceEntityMention m : mEntityMentions.values()) {
      // match the head charseq to 1+ phrase(s)
      matchOrExit(m.getHead(), "entity mention head", filePrefix);
      // match the extent charseq to 1+ phrase(s)
      matchOrExit(m.getExtent(), "entity mention extent", filePrefix);
      // set the head word of the mention
      m.detectHeadToken(this);
    }

    // we need to do this for events as well since they may not have any
    // AceEntityMentions associated with them (if they have no arguments)
    for (AceEventMention m : mEventMentions.values()) {
      // match the extent charseq to 1+ phrase(s)
      matchOrExit(m.getExtent(), "event mention extent", filePrefix);
    }
  }

  /**
   * Matches one char sequence against the document tokens; on failure logs the
   * sequence, the surrounding tokens and the document prefix, then exits.
   *
   * @param seq the char sequence to match
   * @param what description of the sequence used in the error message
   * @param filePrefix prefix of the document files, used in the error message
   */
  private void matchOrExit(AceCharSeq seq, String what, String filePrefix) {
    try {
      seq.match(mTokens);
    } catch (MatchException e) {
      mLog.severe("READER ERROR: Failed to match " + what + ": " + "[" + seq.getText() + ", "
          + seq.getByteStart() + ", " + seq.getByteEnd() + "]");
      mLog.severe("Document tokens: " + tokensWithByteSpan(seq.getByteStart(), seq.getByteEnd()));
      mLog.severe("Document prefix: " + filePrefix);
      // NOTE(review): terminating the JVM from library code is harsh; kept
      // because existing callers may rely on it
      System.exit(1);
    }
  }

  /**
   * Parses an ACE document. Works in the following steps: (a) reads the XML
   * annotations (or creates an empty document when predicted boundaries are
   * used); (b) reads the tokens; (c) matches the tokens against the
   * annotations; (d) constructs the per-sentence entity, relation and event
   * mention tables.
   *
   * @param prefix path prefix of the document files, without extension
   * @param usePredictedBoundaries if true the XML annotations are not read
   * @return the parsed document
   */
  public static AceDocument parseDocument(String prefix, boolean usePredictedBoundaries) throws java.io.IOException,
      org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException {
    AceDocument doc = readAndMatch(prefix, usePredictedBoundaries);
    doc.buildSentenceEntityMentions();
    doc.buildSentenceRelationMentions();
    doc.buildSentenceEventMentions();
    return doc;
  }

  /**
   * Same as {@link #parseDocument(String, boolean)} but only the per-sentence
   * entity mention table is constructed; the relation and event tables are
   * skipped (heeyoung: for ACE2004).
   *
   * @param prefix path prefix of the document files, without extension
   * @param usePredictedBoundaries if true the XML annotations are not read
   * @param AceVersion not used by this method
   * @return the parsed document
   */
  public static AceDocument parseDocument(String prefix, boolean usePredictedBoundaries, String AceVersion) throws java.io.IOException,
      org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException {
    AceDocument doc = readAndMatch(prefix, usePredictedBoundaries);
    doc.buildSentenceEntityMentions();
    return doc;
  }

  /**
   * Shared front half of the parseDocument methods: creates the document,
   * reads the raw text and the tokens, and matches char sequences to tokens.
   */
  private static AceDocument readAndMatch(String prefix, boolean usePredictedBoundaries) throws java.io.IOException,
      org.xml.sax.SAXException, javax.xml.parsers.ParserConfigurationException {
    mLog.fine("Reading document " + prefix);
    AceDocument doc;

    if (!usePredictedBoundaries) {
      // read the ACE XML annotations
      doc = AceDomReader.parseDocument(new File(prefix + XML_EXT));
    } else {
      // will use the predicted entity boundaries: start from an empty
      // document whose id is the last path component of the prefix
      int lastSlash = prefix.lastIndexOf(File.separator);
      assert (lastSlash > 0 && lastSlash < prefix.length() - 1);
      String id = prefix.substring(lastSlash + 1);
      doc = new AceDocument(id);
    }
    doc.setPrefix(prefix);

    //
    // read the raw text, preferring the truecased version when it exists
    //
    String trueCasedFileName = prefix + ORIG_EXT + ".truecase";
    if (new File(trueCasedFileName).exists()) {
      mLog.info("Using truecased file: " + trueCasedFileName);
      doc.readRawBytes(trueCasedFileName);
    } else {
      doc.readRawBytes(prefix + ORIG_EXT);
    }

    //
    // read the AceTokens
    //
    int offsetToSubtract = 0;
    List<List<AceToken>> sentences = AceSentenceSegmenter.tokenizeAndSegmentSentences(prefix);
    doc.setSentences(sentences);
    for (List<AceToken> sentence : sentences) {
      for (AceToken token : sentence) {
        offsetToSubtract = token.adjustPhrasePositions(offsetToSubtract, token.getLiteral());
        doc.addToken(token);
      }
    }

    //
    // match char sequences to phrases
    //
    doc.matchCharSeqs(prefix);
    return doc;
  }

  /**
   * Grows the three per-sentence mention tables in lockstep until they have a
   * row for the given sentence index.
   */
  private void ensureSentenceRows(int sentence) {
    while (sentence >= mSentenceEntityMentions.size()) {
      mSentenceEntityMentions.add(new ArrayList<>());
      mSentenceRelationMentions.add(new ArrayList<>());
      mSentenceEventMentions.add(new ArrayList<>());
    }
  }

  /**
   * Files every entity mention under the sentence of its head's first token.
   * Within a sentence mentions are kept in increasing order (a) of the start
   * position of their head, (b) if start is the same, of the head end.
   */
  private void buildSentenceEntityMentions() {
    for (AceEntityMention em : mEntityMentions.values()) {
      int sentence = mTokens.get(em.getHead().getTokenStart()).getSentence();
      ensureSentenceRows(sentence);

      ArrayList<AceEntityMention> row = mSentenceEntityMentions.get(sentence);
      int at = 0;
      for (; at < row.size(); at++) {
        AceEntityMention crt = row.get(at);
        if ((crt.getHead().getTokenStart() > em.getHead().getTokenStart())
            || (crt.getHead().getTokenStart() == em.getHead().getTokenStart()
                && crt.getHead().getTokenEnd() > em.getHead().getTokenEnd())) {
          break;
        }
      }
      // at == row.size() when no later mention was found, i.e. append
      row.add(at, em);
    }
  }

  /**
   * Files every relation mention under the sentence of the first token of the
   * head of its argument 0. Within a sentence mentions are kept in increasing
   * order (a) of their minimum token start, (b) if equal, of their maximum
   * token end.
   */
  private void buildSentenceRelationMentions() {
    for (AceRelationMention rm : mRelationMentions.values()) {
      int sentence = mTokens.get(rm.getArg(0).getHead().getTokenStart()).getSentence();
      // normally the rows already exist from the entity pass; this only
      // guards against an out-of-range row
      ensureSentenceRows(sentence);

      ArrayList<AceRelationMention> row = mSentenceRelationMentions.get(sentence);
      int at = 0;
      for (; at < row.size(); at++) {
        AceRelationMention crt = row.get(at);
        if ((crt.getMinTokenStart() > rm.getMinTokenStart())
            || (crt.getMinTokenStart() == rm.getMinTokenStart() && crt.getMaxTokenEnd() > rm.getMaxTokenEnd())) {
          break;
        }
      }
      row.add(at, rm);
    }
  }

  /**
   * Files every event mention under the sentence of its minimum start token.
   * Within a sentence mentions are kept in increasing order (a) of their
   * minimum token start, (b) if equal, of their maximum token end.
   */
  private void buildSentenceEventMentions() {
    for (AceEventMention em : mEventMentions.values()) {
      int sentence = mTokens.get(em.getMinTokenStart()).getSentence();
      // rows are adjusted here again because an event with no entities near
      // the end of the document would otherwise have no row
      ensureSentenceRows(sentence);

      ArrayList<AceEventMention> row = mSentenceEventMentions.get(sentence);
      int at = 0;
      for (; at < row.size(); at++) {
        AceEventMention crt = row.get(at);
        if ((crt.getMinTokenStart() > em.getMinTokenStart())
            || (crt.getMinTokenStart() == em.getMinTokenStart() && crt.getMaxTokenEnd() > em.getMaxTokenEnd())) {
          break;
        }
      }
      row.add(at, em);
    }
  }

  /**
   * Adds all relation mentions to the per-sentence relation mention table, in
   * textual order. Calling it on a table that is already filled adds the
   * mentions a second time.
   */
  // TODO: never used?
  public void constructSentenceRelationMentions() {
    buildSentenceRelationMentions();
  }

  /**
   * Verifies if the two tokens are part of the same chunk: every token in
   * (left, right] must carry an "I-" chunk label and must not be one of
   * ",", "(" or "-", and the left token's chunk label must not be "O".
   */
  public boolean sameChunk(int left, int right) {
    for (int i = right; i > left; i--) {
      String chunk = AceToken.OTHERS.get(getToken(i).getChunk());
      if (!chunk.startsWith("I-")) {
        return false;
      }
      String word = AceToken.WORDS.get(getToken(i).getWord());
      if (word.equals(",") || word.equals("(") || word.equals("-")) {
        return false;
      }
    }
    String leftChunk = AceToken.OTHERS.get(getToken(left).getChunk());
    return !leftChunk.equals("O");
  }

  /**
   * Returns true when the token after {@code pos} does not continue a chunk
   * (its chunk label does not start with "I-"). The last token of the document
   * has no successor and therefore always counts as a chunk head.
   */
  public boolean isChunkHead(int pos) {
    if (pos + 1 >= getTokenCount()) {
      return true;
    }
    String next = AceToken.OTHERS.get(getToken(pos + 1).getChunk());
    return !next.startsWith("I-");
  }

  /**
   * Returns the position of the last token of the chunk containing
   * {@code pos}, or {@code pos} itself when that token is outside any chunk.
   */
  public int findChunkEnd(int pos) {
    String crt = AceToken.OTHERS.get(getToken(pos).getChunk());
    if (crt.equals("O")) {
      return pos;
    }
    for (pos = pos + 1; pos < getTokenCount(); pos++) {
      crt = AceToken.OTHERS.get(getToken(pos).getChunk());
      if (!crt.startsWith("I-")) {
        break;
      }
    }
    return pos - 1;
  }

  /**
   * Returns the position of the first token of the chunk containing
   * {@code pos}, or {@code pos} itself when that token is outside any chunk.
   * When no "B-" label precedes {@code pos} the chunk is taken to start at the
   * first token of the document.
   */
  public int findChunkStart(int pos) {
    String crt = AceToken.OTHERS.get(getToken(pos).getChunk());
    if (crt.equals("O") || crt.startsWith("B-")) {
      return pos;
    }
    for (pos = pos - 1; pos >= 0; pos--) {
      crt = AceToken.OTHERS.get(getToken(pos).getChunk());
      if (crt.startsWith("B-")) {
        break;
      }
    }
    // the scan leaves pos at -1 when it runs off the front; never return an
    // invalid token index
    return Math.max(pos, 0);
  }

  /**
   * Returns true when the chunk of {@code right} immediately follows the chunk
   * of {@code left}, or follows it after a single ",", "-" or "_" token.
   */
  public boolean isApposition(int left, int right) {
    int leftEnd = findChunkEnd(left);
    int rightStart = findChunkStart(right);

    if (rightStart == leftEnd + 1) {
      return true;
    }

    if (rightStart == leftEnd + 2) {
      String comma = AceToken.WORDS.get(getToken(leftEnd + 1).getWord());
      if (comma.equals(",") || comma.equals("-") || comma.equals("_")) {
        return true;
      }
    }

    return false;
  }

  /** Counts the tokens in [start, end) whose POS tag starts with "VB". */
  public int countVerbs(int start, int end) {
    int count = 0;
    for (int i = start; i < end; i++) {
      String crt = AceToken.OTHERS.get(getToken(i).getPos());
      if (crt.startsWith("VB")) {
        count++;
      }
    }
    return count;
  }

  /** Counts the tokens in [start, end) whose word is ",". */
  public int countCommas(int start, int end) {
    int count = 0;
    for (int i = start; i < end; i++) {
      String crt = AceToken.WORDS.get(getToken(i).getWord());
      if (crt.equals(",")) {
        count++;
      }
    }
    return count;
  }

  /**
   * Reads the whole file into {@code mRawBuffer}, character by character.
   *
   * @param fileName file to read
   * @throws IOException if the file cannot be opened or read
   */
  private void readRawBytes(String fileName) throws IOException {
    // NOTE(review): FileReader decodes with the platform default charset;
    // confirm whether the corpus should be read as UTF-8 explicitly
    try (BufferedReader in = new BufferedReader(new FileReader(fileName))) {
      StringBuilder buf = new StringBuilder();
      int c;
      while ((c = in.read()) >= 0) {
        buf.append((char) c);
      }
      mRawBuffer = buf.toString();
    }
  }

  /**
   * Reads one B-/I-/O label per non-empty line (the first token of the line)
   * and creates one entity per labelled span. The entity type and subtype are
   * taken from a label of the form {@code X-TYPE-SUBTYPE}.
   *
   * @param is reader positioned at the first label line
   * @throws java.io.IOException if reading fails
   * @throws RuntimeException if a span's first label lacks the type/subtype dash
   */
  @SuppressWarnings("unused")
  private void readPredictedEntityBoundaries(BufferedReader is) throws java.io.IOException {
    //
    // read Massi's B-ENT, I-ENT, or O labels
    //
    ArrayList<String> labels = new ArrayList<>();
    String line;
    while ((line = is.readLine()) != null) {
      ArrayList<String> tokens = SimpleTokenize.tokenize(line);
      if (!tokens.isEmpty()) {
        labels.add(tokens.get(0));
      }
    }
    assert (labels.size() == mTokens.size());

    int entityId = 1;

    //
    // traverse the label array and create entities as needed
    //
    for (int i = 0; i < labels.size(); i++) {
      // Massi's ents may start with I-ENT
      if (labels.get(i).startsWith("B-") || labels.get(i).startsWith("I-")) {
        int startToken = i;
        int endToken = i + 1;
        while (endToken < labels.size() && labels.get(endToken).startsWith("I-")) {
          endToken++;
        }

        //
        // Set the type/subtype to whatever Massi predicted
        // This is not directly used in this system. It is needed only
        // to generate the APF files with Massi info, which are needed
        // by Edgar. Otherwise type/subtype could be safely set to "none".
        //
        String label = labels.get(startToken);
        int dash = label.indexOf("-", 2);
        if (dash <= 2 || dash >= label.length()) {
          throw new RuntimeException(label);
        }
        assert (dash > 2 && dash < label.length() - 1);
        String type = label.substring(2, dash);
        String subtype = label.substring(dash + 1);

        // create a new entity between [startToken, endToken)
        makeEntity(startToken, endToken, entityId, type, subtype);

        // skip over this entity
        i = endToken - 1;
        entityId++;
      } else {
        assert (labels.get(i).equals("O"));
      }
    }
  }

  /**
   * Builds the char sequence covering the tokens in [startToken, endToken).
   * The span is first clipped to the token list and then shrunk from both
   * sides past tokens with a negative byte start (SGML tokens). The text is
   * cut out of the raw buffer using the raw byte positions of the first and
   * last remaining token.
   *
   * @param startToken first token of the span (inclusive)
   * @param endToken last token of the span (exclusive)
   * @return the char sequence for the span
   */
  public AceCharSeq makeCharSeq(int startToken, int endToken) {
    startToken = Math.max(0, startToken);
    while (mTokens.get(startToken).getByteStart() < 0) {
      // SGML token
      startToken++;
    }
    endToken = Math.min(endToken, mTokens.size());
    while (mTokens.get(endToken - 1).getByteStart() < 0) {
      // SGML token
      endToken--;
    }
    assert (endToken > startToken);

    AceToken first = mTokens.get(startToken);
    AceToken last = mTokens.get(endToken - 1);
    String text = mRawBuffer.substring(first.getRawByteStart(), last.getRawByteEnd());

    // the char sequence end is inclusive, hence the -1
    return new AceCharSeq(text, first.getByteStart(), last.getByteEnd() - 1);
  }

  /**
   * Makes an ACE entity from the span [startToken, endToken), with a single
   * "NOM" mention whose extent and head are both the span's char sequence.
   */
  private void makeEntity(int startToken, int endToken, int id, String type, String subtype) {
    String eid = mId + "-E" + id;
    AceEntity ent = new AceEntity(eid, type, subtype, "SPC");
    addEntity(ent);

    AceCharSeq cseq = makeCharSeq(startToken, endToken);
    String emid = mId + "-E" + id + "-1";
    AceEntityMention entm = new AceEntityMention(emid, "NOM", "NOM", cseq, cseq);
    addEntityMention(entm);
    ent.addMention(entm);
  }
}