package edu.stanford.nlp.sequences;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.AbstractIterator;
import java.util.*;
import java.util.regex.*;
import java.io.*;
/**
* DocumentReader for the original CoNLL 03 format. In this format, there is
* one word per line, with extra attributes of a word (POS tag, chunk, etc.) in
* other space or tab separated columns, where leading and trailing whitespace
* on the line are ignored. Sentences are supposedly
* separated by a blank line (one with no non-whitespace characters), but
* where blank lines occur is in practice often fairly random. In particular,
* sometimes entities span blank lines. Nevertheless, in this class, like in
* our original CoNLL system, these blank lines are preserved as a special
* BOUNDARY token and detected and exploited by some features. The text is
* divided into documents at each '-DOCSTART-' token, which is seen as a
* special token, which is also preserved. The reader can read data in any
* of the IOB/IOE/etc. formats and output tokens in any other, based on the
* entitySubclassification flag.
* <p>
* This reader is specifically for replicating CoNLL systems. For normal use,
* you should use the saner ColumnDocumentReaderAndWriter.
*
* @author Jenny Finkel
* @author Huy Nguyen
* @author Christopher Manning
*/
public class CoNLLDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {

  private static final long serialVersionUID = 6281374154299530460L;

  /** Special token emitted for a blank line; some features key off sentence boundaries. */
  public static final String BOUNDARY = "*BOUNDARY*";

  /** Historically, this reader used to treat the whole input as one document, but now it doesn't */
  private static final boolean TREAT_FILE_AS_ONE_DOCUMENT = false;

  /** A line beginning (after optional whitespace) with -DOCSTART- starts a new document. */
  private static final Pattern docPattern = Pattern.compile("^\\s*-DOCSTART-\\s");

  /** A line containing only whitespace (a blank line / sentence separator). */
  private static final Pattern white = Pattern.compile("^\\s*$");

  private SeqClassifierFlags flags; // = null;

  @Override
  public void init(SeqClassifierFlags flags) {
    this.flags = flags;
  }

  @Override
  public String toString() {
    return "CoNLLDocumentReaderAndWriter[entitySubclassification: " +
            flags.entitySubclassification + ", intern: " + flags.intern + ']';
  }

  /** Returns an iterator over documents, one List of CoreLabel per -DOCSTART- section.
   *
   *  @param r Reader over CoNLL-format text; it is consumed eagerly by the iterator's constructor
   */
  @Override
  public Iterator<List<CoreLabel>> getIterator(Reader r) {
    return new CoNLLIterator(r);
  }

  /** Iterates over documents, converting each raw document String lazily via processDocument.
   *  Non-static because processDocument needs the enclosing reader's flags.
   */
  private class CoNLLIterator extends AbstractIterator<List<CoreLabel>> {

    public CoNLLIterator (Reader r) {
      stringIter = splitIntoDocs(r);
    }

    @Override
    public boolean hasNext() { return stringIter.hasNext(); }

    @Override
    public List<CoreLabel> next() { return processDocument(stringIter.next()); }

    private Iterator<String> stringIter; // = null;

  } // end class CoNLLIterator

  /** Splits the reader's contents into documents at -DOCSTART- lines.
   *  The -DOCSTART- line itself is kept at the head of the document that follows it.
   *  The whole reader is consumed before this method returns (documents are materialized).
   *
   *  @param r Reader over CoNLL-format text
   *  @return An iterator over the raw text of each document
   */
  private static Iterator<String> splitIntoDocs(Reader r) {
    if (TREAT_FILE_AS_ONE_DOCUMENT) {
      return Collections.singleton(IOUtils.slurpReader(r)).iterator();
    } else {
      Collection<String> docs = new ArrayList<>();
      ObjectBank<String> ob = ObjectBank.getLineIterator(r);
      StringBuilder current = new StringBuilder();
      for (String line : ob) {
        if (docPattern.matcher(line).lookingAt()) {
          // Start new doc, store old one if non-empty
          if (current.length() > 0) {
            docs.add(current.toString());
            current = new StringBuilder();
          }
        }
        current.append(line);
        current.append('\n');
      }
      if (current.length() > 0) {
        docs.add(current.toString());
      }
      return docs.iterator();
    }
  }

  /** Converts one raw document into tokens, then recodes the answer labels into the
   *  entity encoding requested by flags.entitySubclassification (IOB1/IOB2/IOE/etc.).
   *  Blank lines become BOUNDARY tokens unless flags.deleteBlankLines is set.
   *
   *  @param doc The raw text of one document (newline-separated CoNLL lines)
   *  @return The document as a List of CoreLabel tokens
   */
  private List<CoreLabel> processDocument(String doc) {
    List<CoreLabel> list = new ArrayList<>();
    String[] lines = doc.split("\n");
    for (String line : lines) {
      if ( ! flags.deleteBlankLines || ! white.matcher(line).matches()) {
        list.add(makeCoreLabel(line));
      }
    }
    IOBUtils.entitySubclassify(list, CoreAnnotations.AnswerAnnotation.class,
        flags.backgroundSymbol, flags.entitySubclassification, flags.intern);
    return list;
  }

  /** This deals with the CoNLL files for different languages which have
   *  between 2 and 5 columns on non-blank lines.
   *  Columns are (word[, lemma][, POS][, chunk], answer); a blank line yields a BOUNDARY token.
   *
   *  @param line A line of CoNLL input
   *  @return The constructed token
   *  @throws RuntimeIOException If a line has more than 5 whitespace-separated fields
   */
  private CoreLabel makeCoreLabel(String line) {
    CoreLabel wi = new CoreLabel();
    // wi.line = line;
    String[] bits = line.split("\\s+");
    switch (bits.length) {
    case 0:
    case 1:
      wi.setWord(BOUNDARY);
      wi.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
      break;
    case 2:
      wi.setWord(bits[0]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[1]);
      break;
    case 3:
      wi.setWord(bits[0]);
      wi.setTag(bits[1]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[2]);
      break;
    case 4:
      wi.setWord(bits[0]);
      wi.setTag(bits[1]);
      wi.set(CoreAnnotations.ChunkAnnotation.class, bits[2]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[3]);
      break;
    case 5:
      if (flags.useLemmaAsWord) {
        wi.setWord(bits[1]);
      } else {
        wi.setWord(bits[0]);
      }
      wi.set(CoreAnnotations.LemmaAnnotation.class, bits[1]);
      wi.setTag(bits[2]);
      wi.set(CoreAnnotations.ChunkAnnotation.class, bits[3]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[4]);
      break;
    default:
      throw new RuntimeIOException("Unexpected input (many fields): " + line);
    }
    //Value annotation is used in a lot of place in corenlp so setting here as the word itself
    wi.set(CoreAnnotations.ValueAnnotation.class, wi.word());
    // The copy to GoldAnswerAnnotation is done before the recoding is done, and so it preserves the original coding.
    // This is important if the original coding is true, but the recoding is defective (like IOB2 to IO), since
    // it will allow correct evaluation later.
    wi.set(CoreAnnotations.GoldAnswerAnnotation.class, wi.get(CoreAnnotations.AnswerAnnotation.class));
    return wi;
  }

  /** Return the coding scheme to IOB1 coding, regardless of what was used
   *  internally (unless retainEntitySubclassification is set).
   *  This is useful for scoring against CoNLL test output.
   *
   *  @param tokens List of tokens in some NER encoding
   */
  private void deEndify(List<CoreLabel> tokens) {
    if (flags.retainEntitySubclassification) {
      return;
    }
    IOBUtils.entitySubclassify(tokens, CoreAnnotations.AnswerAnnotation.class,
        flags.backgroundSymbol, "iob1", flags.intern);
  }

  /** Write a standard CoNLL format output file.
   *  Output columns are word, POS, chunk, gold answer, guessed answer; BOUNDARY tokens
   *  are written as blank lines. Labels are recoded to IOB1 first unless the reader was
   *  already configured for IOB1.
   *
   *  @param doc The document: A List of CoreLabel
   *  @param out Where to send the answers to
   */
  @Override
  // NOTE: suppression id was previously misspelled "StringContatenationInLoop" and therefore inert.
  @SuppressWarnings({"StringEquality", "StringConcatenationInLoop"})
  public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
    // boolean tagsMerged = flags.mergeTags;
    // boolean useHead = flags.splitOnHead;

    if ( ! "iob1".equalsIgnoreCase(flags.entitySubclassification)) {
      deEndify(doc);
    }

    for (CoreLabel fl : doc) {
      String word = fl.word();
      if (word == BOUNDARY) { // Using == is okay, because it is set to constant
        out.println();
      } else {
        String gold = fl.getString(CoreAnnotations.GoldAnswerAnnotation.class);
        String guess = fl.get(CoreAnnotations.AnswerAnnotation.class);
        // log.info(word + "\t" + gold + "\t" + guess));
        String pos = fl.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
        String chunk = fl.getString(CoreAnnotations.ChunkAnnotation.class);
        out.println(fl.word() + '\t' + pos + '\t' + chunk + '\t' +
                    gold + '\t' + guess);
      }
    }
  }

  /** If a MISC entity has been accumulated, record its phrase in the counter and
   *  return a fresh, empty accumulator; otherwise return the accumulator unchanged.
   *
   *  @param inProgressMisc Accumulator holding the words of a MISC entity in progress
   *  @param miscCounter Counter of completed MISC phrases
   *  @return The (possibly reset) accumulator to keep using
   */
  private static StringBuilder maybeIncrementCounter(StringBuilder inProgressMisc, Counter<String> miscCounter) {
    if (inProgressMisc.length() > 0) {
      miscCounter.incrementCount(inProgressMisc.toString());
      inProgressMisc = new StringBuilder();
    }
    return inProgressMisc;
  }

  /** Count some stats on what occurs in a file: documents, tokens, entities,
   *  and the phrases labeled MISC (with counts).
   *
   *  @param args args[0] is the path of the CoNLL-format file to analyze
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
    rw.init(new SeqClassifierFlags());
    int numDocs = 0;
    int numTokens = 0;
    int numEntities = 0;
    String lastAnsBase = "";
    Counter<String> miscCounter = new ClassicCounter<>();
    StringBuilder inProgressMisc = new StringBuilder();
    // try-with-resources: splitIntoDocs consumes the reader eagerly inside getIterator,
    // so closing after iteration is safe (previously the reader was leaked).
    try (Reader reader = IOUtils.readerFromString(args[0])) {
      for (Iterator<List<CoreLabel>> it = rw.getIterator(reader); it.hasNext(); ) {
        List<CoreLabel> doc = it.next();
        numDocs++;
        for (CoreLabel fl : doc) {
          String word = fl.word();
          // System.out.println("FL " + (++i) + " was " + fl);
          if (word.equals(BOUNDARY)) {
            continue;
          }
          String ans = fl.get(CoreAnnotations.AnswerAnnotation.class);
          String ansBase;
          String ansPrefix;
          String[] bits = ans.split("-");
          if (bits.length == 1) {
            ansBase = bits[0];
            ansPrefix = "";
          } else {
            ansBase = bits[1];
            ansPrefix = bits[0];
          }
          numTokens++;
          if ( ! ansBase.equals("O")) {
            if (ansBase.equals(lastAnsBase)) {
              // Same type as previous token: a new entity only if explicitly B-prefixed.
              if (ansPrefix.equals("B")) {
                numEntities++;
                inProgressMisc = maybeIncrementCounter(inProgressMisc, miscCounter);
              }
            } else {
              numEntities++;
              inProgressMisc = maybeIncrementCounter(inProgressMisc, miscCounter);
            }
            if (ansBase.equals("MISC")) {
              if (inProgressMisc.length() > 0) {
                // already something there
                inProgressMisc.append(' ');
              }
              inProgressMisc.append(word);
            }
          } else {
            inProgressMisc = maybeIncrementCounter(inProgressMisc, miscCounter);
          }
          lastAnsBase = ansBase;
        } // for tokens
      } // for documents
    }
    System.out.println("File " + args[0] + " has " + numDocs + " documents, " +
                       numTokens + " (non-blank line) tokens and " +
                       numEntities + " entities.");
    System.out.printf("Here are the %.0f MISC items with counts:%n", miscCounter.totalCount());
    System.out.println(Counters.toVerticalString(miscCounter, "%.0f\t%s"));
  } // end main

} // end class CoNLLDocumentReaderAndWriter