package edu.stanford.nlp.sequences;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ChunkAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.OriginalAnswerAnnotation;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.AbstractIterator;
import java.util.*;
import java.util.regex.*;
import java.io.*;
/**
 * DocumentReader for CoNLL 03 format.  In this format, there is one word
 * per line, with extra attributes of a word (POS tag, chunk, etc.) in other
 * space or tab separated columns, where leading and trailing whitespace on
 * the line are ignored.  Sentences are supposedly
 * separated by a blank line (one with no non-whitespace characters), but
 * where blank lines occur is in practice often fairly random. In particular,
 * entities not infrequently span blank lines.
 * <p>
 * {@link #init(SeqClassifierFlags)} must be called before documents are read
 * or written, because reading and writing consult the flags.
 *
 * @author Jenny Finkel
 * @author Huy Nguyen
 * @author Christopher Manning
 */
public class CoNLLDocumentReaderAndWriter implements DocumentReaderAndWriter {

  private static final long serialVersionUID = 6281374154299530460L;

  /** Word given to the pseudo-token created for a line with fewer than two fields. */
  public static final String BOUNDARY = "*BOUNDARY*";

  /** The background answer label, also assumed for the padding around a document. */
  public static final String OTHER = "O";

  /** Matches a line that contains no non-whitespace characters. */
  private static final Pattern white = Pattern.compile("^\\s*$");

  // Internal codes for the entity encodings understood by entitySubclassify.
  private static final int STYLE_IOB1 = 0;
  private static final int STYLE_IOB2 = 1;
  private static final int STYLE_IOE1 = 2;
  private static final int STYLE_IOE2 = 3;
  private static final int STYLE_IO = 4;
  private static final int STYLE_SBIEO = 5;

  private SeqClassifierFlags flags; // = null;

  /**
   * Stores the flags that control reading and writing.
   *
   * @param flags The flags to use from now on
   */
  public void init(SeqClassifierFlags flags) {
    this.flags = flags;
  }

  /**
   * Describes this reader/writer.  Safe to call before
   * {@link #init(SeqClassifierFlags)} (e.g., from a debugger or a log
   * statement); in that case the flags are reported as not yet set.
   */
  @Override
  public String toString() {
    if (flags == null) {
      return "CoNLLDocumentReaderAndWriter[uninitialized]";
    }
    return "CoNLLDocumentReaderAndWriter[entitySubclassification: " +
        flags.entitySubclassification + ", intern: " + flags.intern + ']';
  }

  /**
   * Returns an iterator over the documents read from r.  The whole of the
   * reader's content is treated as a single document.
   *
   * @param r Where to read the CoNLL-formatted text from
   * @return An iterator over documents, each a List of CoreLabel
   */
  public Iterator<List<CoreLabel>> getIterator(Reader r) {
    return new CoNLLIterator(r);
  }

  /** Lazily turns each document string into a list of tokens. */
  private class CoNLLIterator extends AbstractIterator<List<CoreLabel>> {

    private final Iterator<String> stringIter;

    public CoNLLIterator (Reader r) {
      stringIter = splitIntoDocs(r);
    }

    @Override
    public boolean hasNext() { return stringIter.hasNext(); }

    @Override
    public List<CoreLabel> next() { return processDocument(stringIter.next()); }

  }

  /** Reads everything from r and returns it as a one-element iteration. */
  private static Iterator<String> splitIntoDocs(Reader r) {
    return Collections.singleton(StringUtils.slurpReader(r)).iterator();
  }

  /**
   * Converts the text of a document into tokens, one per input line, and
   * re-encodes the entity labels in the style named by
   * flags.entitySubclassification.  Blank lines are dropped if
   * flags.deleteBlankLines is set; otherwise they become BOUNDARY tokens.
   *
   * @param doc The text of the document
   * @return The tokens of the document
   */
  private List<CoreLabel> processDocument(String doc) {
    List<CoreLabel> lis = new ArrayList<CoreLabel>();
    String[] lines = doc.split("\n");
    for (String line : lines) {
      if ( ! flags.deleteBlankLines || ! white.matcher(line).matches()) {
        lis.add(makeCoreLabel(line));
      }
    }
    entitySubclassify(lis, flags.entitySubclassification);
    return lis;
  }

  /**
   * Maps a style name (case-insensitively) to its internal code.  An unknown
   * name is reported on stderr and treated as "io".
   */
  private static int styleCode(String style) {
    if ("iob1".equalsIgnoreCase(style)) {
      return STYLE_IOB1;
    } else if ("iob2".equalsIgnoreCase(style)) {
      return STYLE_IOB2;
    } else if ("ioe1".equalsIgnoreCase(style)) {
      return STYLE_IOE1;
    } else if ("ioe2".equalsIgnoreCase(style)) {
      return STYLE_IOE2;
    } else if ("io".equalsIgnoreCase(style)) {
      return STYLE_IO;
    } else if ("sbieo".equalsIgnoreCase(style)) {
      return STYLE_SBIEO;
    } else {
      System.err.println("entitySubclassify: unknown style: " + style);
      return STYLE_IO;
    }
  }

  /** Returns the answer of a token, or OTHER if it has none (as for padding tokens). */
  private static String answerOrOther(CoreLabel label) {
    String ans = label.get(AnswerAnnotation.class);
    return (ans == null) ? OTHER : ans;
  }

  /** Returns the label with its two-character prefix (like "B-") cut off; short labels are returned whole. */
  private static String baseOf(String ans) {
    return (ans.length() > 2) ? ans.substring(2) : ans;
  }

  /** Returns the first character of the label, or a space for an empty label. */
  private static char prefixOf(String ans) {
    return (ans.length() > 0) ? ans.charAt(0) : ' ';
  }

  /** Returns whether the label looks like X-TYPE, i.e., has a one-character prefix and a dash. */
  private static boolean hasEntityPrefix(String ans) {
    return ans != null && ans.length() > 1 && ans.charAt(1) == '-';
  }

  /**
   * This was used on the CoNLL data to map from a representation where
   * normally entities were marked I-PERS, but the beginning of non-first
   * items of an entity sequences were marked B-PERS (IOB1 representation).
   * It changes this representation to other representations:
   * a 4 way representation of all entities, like S-PERS, B-PERS,
   * I-PERS, E-PERS for single word, beginning, internal, and end of entity
   * (SBIEO); always marking the first word of an entity (IOB2);
   * the reverse IOE1 and IOE2 and IO.
   * This code is very specific to the particular CoNLL way of labeling
   * classes.  It will work on any of these styles of input, however, except
   * for IO which necessarily loses information.
   * <p>
   * The AnswerAnnotation of each token is overwritten in place.  Labels that
   * do not have the form X-TYPE (such as "O") are left unchanged.
   *
   * @param lineInfos The tokens to relabel
   * @param style One of iob1, iob2, ioe1, ioe2, io, sbieo (case-insensitive)
   */
  private void entitySubclassify(List<CoreLabel> lineInfos,
                                 String style) {
    final int how = styleCode(style);
    // Padding lets us look at i - 1 and i + 1 without bounds checks; the
    // padding token has no answer, which answerOrOther maps to OTHER.
    final List<CoreLabel> padded = new PaddedList<CoreLabel>(lineInfos, new CoreLabel());
    final int k = padded.size();
    final String[] newAnswers = new String[k];
    for (int i = 0; i < k; i++) {
      final String cAns = padded.get(i).get(AnswerAnnotation.class);
      if ( ! hasEntityPrefix(cAns)) {
        newAnswers[i] = cAns;
        continue;
      }
      final String pAns = answerOrOther(padded.get(i - 1));
      final String nAns = answerOrOther(padded.get(i + 1));
      final String base = cAns.substring(2);
      final char prefix = cAns.charAt(0);
      final char pPrefix = prefixOf(pAns);
      final char nPrefix = prefixOf(nAns);
      final boolean sameAsPrevious = base.equals(baseOf(pAns));
      final boolean sameAsNext = base.equals(baseOf(nAns));
      // An explicit boundary marker: this token begins an entity (B, S) or
      // the previous token finished one (E, S); symmetrically for the end.
      final boolean startMarked =
          prefix == 'B' || prefix == 'S' || pPrefix == 'E' || pPrefix == 'S';
      final boolean endMarked =
          prefix == 'E' || prefix == 'S' || nPrefix == 'B' || nPrefix == 'S';
      final boolean isStartAdjacentSame = sameAsPrevious && startMarked;
      final boolean isEndAdjacentSame = sameAsNext && endMarked;
      final boolean isFirst = ( ! sameAsPrevious) || startMarked;
      final boolean isLast = ( ! sameAsNext) || endMarked;

      final String marker;
      switch (how) {
        case STYLE_IOB1:
          marker = isStartAdjacentSame ? "B-" : "I-";
          break;
        case STYLE_IOB2:
          marker = isFirst ? "B-" : "I-";
          break;
        case STYLE_IOE1:
          marker = isEndAdjacentSame ? "E-" : "I-";
          break;
        case STYLE_IOE2:
          marker = isLast ? "E-" : "I-";
          break;
        case STYLE_SBIEO:
          if (isFirst && isLast) {
            marker = "S-";
          } else if (isLast) {
            marker = "E-";
          } else if (isFirst) {
            marker = "B-";
          } else {
            marker = "I-";
          }
          break;
        case STYLE_IO:
        default:
          marker = "I-";
          break;
      }
      newAnswers[i] = intern(marker + base);
    }
    // Only write back once every new label has been computed, since each
    // label depends on the old labels of its neighbours.
    for (int i = 0; i < k; i++) {
      padded.get(i).set(AnswerAnnotation.class, newAnswers[i]);
    }
  }

  /**
   * Builds a token from one line of input.  The line is trimmed and split on
   * whitespace, and the fields are interpreted by their number:
   * <ul>
   * <li>0 or 1 fields: a BOUNDARY token with answer OTHER</li>
   * <li>2 fields: word, answer</li>
   * <li>3 fields: word, tag, answer</li>
   * <li>4 fields: word, tag, chunk, answer</li>
   * <li>5 fields: word, lemma, tag, chunk, answer (the lemma replaces the
   *     word if flags.useLemmaAsWord is set)</li>
   * </ul>
   * The answer is also recorded as the OriginalAnswerAnnotation.
   *
   * @param line One line of the input
   * @return The token for that line
   * @throws RuntimeIOException If the line has more than five fields
   */
  private CoreLabel makeCoreLabel(String line) {
    CoreLabel wi = new CoreLabel();
    // wi.line = line;
    // Trim first: otherwise leading whitespace yields an empty first field
    // and shifts every column by one.
    String[] bits = line.trim().split("\\s+");
    switch (bits.length) {
      case 0:
      case 1:
        wi.setWord(BOUNDARY);
        wi.set(AnswerAnnotation.class, OTHER);
        break;
      case 2:
        wi.setWord(bits[0]);
        wi.set(AnswerAnnotation.class, bits[1]);
        break;
      case 3:
        wi.setWord(bits[0]);
        wi.setTag(bits[1]);
        wi.set(AnswerAnnotation.class, bits[2]);
        break;
      case 4:
        wi.setWord(bits[0]);
        wi.setTag(bits[1]);
        wi.set(ChunkAnnotation.class, bits[2]);
        wi.set(AnswerAnnotation.class, bits[3]);
        break;
      case 5:
        if (flags.useLemmaAsWord) {
          wi.setWord(bits[1]);
        } else {
          wi.setWord(bits[0]);
        }
        wi.set(LemmaAnnotation.class, bits[1]);
        wi.setTag(bits[2]);
        wi.set(ChunkAnnotation.class, bits[3]);
        wi.set(AnswerAnnotation.class, bits[4]);
        break;
      default:
        throw new RuntimeIOException("Unexpected input (many fields): " + line);
    }
    wi.set(OriginalAnswerAnnotation.class, wi.get(AnswerAnnotation.class));
    // This collapses things to do neither iob1 or iob2 but just IO. Remove?
    // if (wi.get(AnswerAnnotation.class).length() > 1 && wi.get(AnswerAnnotation.class).charAt(1) == '-' && !flags.useFourWayEntitySubclassification) {
    //   wi.set(AnswerAnnotation.class, "I-" + wi.get(AnswerAnnotation.class).substring(2));
    // }
    return wi;
  }

  /** Returns s, interned if flags.intern is set. */
  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }

  /**
   * Return the marking scheme to IOB1 marking, regardless of what it was.
   * That is, an entity token is labeled B-TYPE only if it starts an entity
   * directly after a token of the same type, and I-TYPE otherwise.  A token
   * starts an entity if it is marked B or S, or if the token before it is
   * marked E or S.  Does nothing if flags.retainEntitySubclassification is
   * set.  The AnswerAnnotation of each token is overwritten in place.
   *
   * @param lineInfos List of tokens in some NER encoding
   */
  private void deEndify(List<CoreLabel> lineInfos) {
    if (flags.retainEntitySubclassification) {
      return;
    }
    // The padding token before the document has no answer; answerOrOther
    // maps that to OTHER so that an entity on the very first token is fine.
    final List<CoreLabel> padded = new PaddedList<CoreLabel>(lineInfos, new CoreLabel());
    final int k = padded.size();
    final String[] newAnswers = new String[k];
    for (int i = 0; i < k; i++) {
      final String cAns = padded.get(i).get(AnswerAnnotation.class);
      if ( ! hasEntityPrefix(cAns)) {
        newAnswers[i] = cAns;
        continue;
      }
      final String pAns = answerOrOther(padded.get(i - 1));
      final String base = cAns.substring(2);
      final char prefix = cAns.charAt(0);
      final char pPrefix = prefixOf(pAns);
      final boolean isSecond = base.equals(baseOf(pAns));
      final boolean isStart =
          prefix == 'B' || prefix == 'S' || pPrefix == 'E' || pPrefix == 'S';
      newAnswers[i] = intern(((isSecond && isStart) ? "B-" : "I-") + base);
    }
    for (int i = 0; i < k; i++) {
      padded.get(i).set(AnswerAnnotation.class, newAnswers[i]);
    }
  }

  /**
   * Writes one line per token with the tab-separated columns word, tag,
   * chunk, gold answer (the OriginalAnswerAnnotation, or empty if absent)
   * and guessed answer (the AnswerAnnotation).  A BOUNDARY token is written
   * as an empty line.  Unless the entity style is iob1, the guessed answers
   * are first converted back by deEndify, which changes the tokens of doc.
   * <p>
   * The gold column is deliberately the unmodified original answer: CoNLL
   * uses IOB1, which only marks B- where two entities of the same type are
   * adjacent, so the answers must not be rewritten to always start with B-.
   *
   * @param doc The document: A List of CoreLabel
   * @param out Where to send the answers to
   */
  public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
    // boolean tagsMerged = flags.mergeTags;
    // boolean useHead = flags.splitOnHead;
    if ( ! "iob1".equalsIgnoreCase(flags.entitySubclassification)) {
      deEndify(doc);
    }

    for (CoreLabel fl : doc) {
      String word = fl.word();
      // Compare by value: a token need not hold the very same String object
      // as the constant (e.g., after deserialization).
      if (BOUNDARY.equals(word)) {
        out.println();
      } else {
        String gold = fl.get(OriginalAnswerAnnotation.class);
        if (gold == null) {
          gold = "";
        }
        String guess = fl.get(AnswerAnnotation.class);
        String pos = fl.tag();
        String chunk = fl.get(ChunkAnnotation.class);
        if (chunk == null) {
          chunk = "";
        }
        out.println(word + '\t' + pos + '\t' + chunk + '\t' +
                    gold + '\t' + guess);
      }
    }
  }

  /**
   * Count some stats on what occurs in a file.
   * Currently does nothing: the counting code below is disabled.
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    //     CoNLLDocumentReaderAndWriter f = new CoNLLDocumentReaderAndWriter();
    //     int numTokens = 0;
    //     int numEntities = 0;
    //     String lastAnsBase = "";
    //     List<CoreLabel> ll = f.processDocument(args[0]);
    //     for (CoreLabel fl : ll) {
    //       // System.out.println("FL " + (++i) + " was " + fl);
    //       if (fl.word().equals(BOUNDARY)) {
    //         continue;
    //       }
    //       String ans = fl.get(AnswerAnnotation.class);
    //       String ansBase;
    //       String ansPrefix;
    //       String[] bits = ans.split("-");
    //       if (bits.length == 1) {
    //         ansBase = bits[0];
    //         ansPrefix = "";
    //       } else {
    //         ansBase = bits[1];
    //         ansPrefix = bits[0];
    //       }
    //       numTokens++;
    //       if (ansBase.equals("O")) {
    //       } else if (ansBase.equals(lastAnsBase)) {
    //         if (ansPrefix.equals("B")) {
    //           numEntities++;
    //         }
    //       } else {
    //         numEntities++;
    //       }
    //     }
    //     System.out.println("File " + args[0] + " has " + numTokens +
    //                        " tokens and " + numEntities + " entities.");
  } // end main

} // end class CoNLLDocumentReaderAndWriter