package edu.stanford.nlp.ie.machinereading.domains.ace.reader;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.Vector;
import edu.stanford.nlp.trees.Span;
/**
* Implements the ACE {@literal <charseq>} construct.
*
* @author David McClosky
* @author Andrey Gusev
*/
public class AceCharSeq {

  /** A logger for this class. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AceCharSeq.class);

  /** The exact text matched by this sequence. */
  private final String mText;

  /** Byte offsets of this sequence in the document stream (ACE uses an inclusive end). */
  private final Span mByteOffset;

  /**
   * Span of token indices that cover this char sequence.
   * Remains {@code null} until {@link #match(Vector)} succeeds.
   */
  private Span mTokenOffset;

  /**
   * Constructs a char sequence for the given text and byte span.
   *
   * @param text the exact text matched by this sequence
   * @param start byte offset of the first character
   * @param end byte offset of the last character (inclusive, per ACE convention)
   */
  public AceCharSeq(String text, int start, int end) {
    mText = text;
    mByteOffset = new Span(start, end);
    mTokenOffset = null;
  }

  /**
   * Renders this char sequence as an ACE XML fragment wrapped in an element named
   * {@code label}, e.g. {@code <label>\n  <charseq START="..." END="...">text</charseq>\n</label>}.
   *
   * @param label name of the wrapping XML element
   * @param offset indentation offset, in spaces, for the outer element
   * @return the XML fragment as a string
   */
  public String toXml(String label, int offset) {
    // NOTE(review): StringBuffer kept (not StringBuilder) because the buffer is
    // handed to AceElement.appendOffset, whose signature is declared elsewhere.
    StringBuffer buffer = new StringBuffer();
    AceElement.appendOffset(buffer, offset);
    buffer.append('<').append(label).append(">\n");
    AceElement.appendOffset(buffer, offset + 2);
    buffer.append("<charseq START=\"").append(mByteOffset.start())
        .append("\" END=\"").append(mByteOffset.end()).append("\">");
    buffer.append(mText).append("</charseq>");
    buffer.append('\n');
    AceElement.appendOffset(buffer, offset);
    buffer.append("</").append(label).append('>');
    return buffer.toString();
  }

  /**
   * Renders this char sequence as a bare {@code <charseq>} XML fragment, without a
   * wrapping element and without a trailing newline.
   *
   * @param offset indentation offset, in spaces (the fragment is indented by offset + 2)
   * @return the XML fragment as a string
   */
  public String toXml(int offset) {
    StringBuffer buffer = new StringBuffer();
    AceElement.appendOffset(buffer, offset + 2);
    buffer.append("<charseq START=\"").append(mByteOffset.start())
        .append("\" END=\"").append(mByteOffset.end()).append("\">");
    buffer.append(mText).append("</charseq>");
    return buffer.toString();
  }

  /** Returns the exact text matched by this sequence. */
  public String getText() {
    return mText;
  }

  /** Returns the byte offset of the first character. */
  public int getByteStart() {
    return mByteOffset.start();
  }

  /** Returns the byte offset of the last character (inclusive, per ACE convention). */
  public int getByteEnd() {
    return mByteOffset.end();
  }

  /** Returns the full byte span of this sequence. */
  public Span getByteOffset() {
    return mByteOffset;
  }

  /** Returns the index of the first matching token, or -1 if {@link #match(Vector)} has not run. */
  public int getTokenStart() {
    if (mTokenOffset == null)
      return -1;
    return mTokenOffset.start();
  }

  /** Returns the index of the last matching token, or -1 if {@link #match(Vector)} has not run. */
  public int getTokenEnd() {
    if (mTokenOffset == null)
      return -1;
    return mTokenOffset.end();
  }

  /** Returns the token span covering this sequence, or {@code null} before {@link #match(Vector)}. */
  public Span getTokenOffset() {
    return mTokenOffset;
  }

  /**
   * Matches this char sequence against the full token stream. On success,
   * {@code mTokenOffset} is initialized with the span of covering token indices.
   *
   * @param tokens the document's full token stream, in byte-offset order
   * @throws MatchException if no covering token span can be found
   */
  public void match(Vector<AceToken> tokens) throws MatchException {
    int start = -1;
    int end = -1;

    for (int i = 0; i < tokens.size(); i++) {
      //
      // we found the starting token
      //
      if (tokens.get(i).getByteOffset().start() == mByteOffset.start()) {
        start = i;
      }
      //
      // we do not tokenize dashed-words, hence the start may be inside a token
      // e.g. Saddam => pro-Saddam
      // the same situation will happen due to (uncommon) annotation errors
      //
      else if (mByteOffset.start() > tokens.get(i).getByteOffset().start()
          && mByteOffset.start() < tokens.get(i).getByteOffset().end()) {
        start = i;
      }

      //
      // we found the ending token
      // Note: ACE is inclusive for the end position, my tokenization is not
      // in ACE: end position == position of last byte in token
      // in .sgm.pre: end position == position of last byte + 1
      //
      if (tokens.get(i).getByteOffset().end() == mByteOffset.end() + 1) {
        end = i;
        break;
      }
      //
      // we do not tokenize dashed-words, hence the end may be inside a token
      // e.g. Conference => Conference-leading
      // the same situation will happen due to (uncommon) annotation errors
      //
      // NOTE(review): this branch can fire (and break) before a start token was
      // found, in which case the match deliberately fails below — TODO confirm
      // this early exit is intended for malformed annotations.
      //
      else if (mByteOffset.end() >= tokens.get(i).getByteOffset().start()
          && mByteOffset.end() < tokens.get(i).getByteOffset().end() - 1) {
        end = i;
        break;
      }
    }

    if (start >= 0 && end >= 0) {
      mTokenOffset = new Span(start, end);
    } else {
      // Include the text and byte span so failed matches can be diagnosed
      // without re-running under a debugger.
      throw new MatchException("Match failed for char sequence \"" + mText
          + "\" at byte span " + mByteOffset);
    }
  }

  @Override
  public String toString() {
    return "AceCharSeq [mByteOffset=" + mByteOffset + ", mText=" + mText
        + ", mTokenOffset=" + mTokenOffset + ']';
  }
}