package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.util.List;
import edu.stanford.nlp.io.PrintFile;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.io.TaggedFileRecord;
import edu.stanford.nlp.util.ConfusionMatrix;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
/** Tags data and can handle either data with gold-standard tags (computing
* performance statistics) or unlabeled data.
*
* @author Kristina Toutanova
* @version 1.0
*/
// TODO: can we break this class up in some way? Perhaps we can
// spread some functionality into TestSentence and some into MaxentTagger
// TODO: at the very least, it doesn't seem to make sense to make it
// an object with state, rather than just some static methods
public class TestClassifier {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TestClassifier.class);

  /** Record describing the tagged file whose sentences are evaluated. */
  private final TaggedFileRecord fileRecord;

  // Running totals, accumulated across all sentences by processResults().
  private int numRight;
  private int numWrong;
  private int unknownWords;
  private int numWrongUnknown;
  private int numCorrectSentences;
  private int numSentences;

  private ConfusionMatrix<String> confusionMatrix;

  // TODO: only one boolean here instead of 4?  They all use the same
  // debug status
  private boolean writeUnknDict;
  private boolean writeWords;
  private boolean writeTopWords;
  private boolean writeConfusionMatrix;

  MaxentTagger maxentTagger;
  TaggerConfig config;
  String saveRoot;

  /**
   * Evaluates the tagger on the file named by {@code maxentTagger.config.getFile()}.
   *
   * @param maxentTagger the tagger to evaluate
   * @throws IOException if reading the test data or writing a debug file fails
   */
  public TestClassifier(MaxentTagger maxentTagger) throws IOException {
    this(maxentTagger, maxentTagger.config.getFile());
  }

  /**
   * Evaluates the tagger on {@code testFile}. The whole evaluation runs inside
   * this constructor; afterwards the accumulated counts can be read through
   * {@link #resultsString(MaxentTagger)} and {@link #getNumWords()}.
   * When the debug flag of the tagger's config is set, the confusion matrix is
   * additionally written to {@code <saveRoot>.confusion}.
   *
   * @param maxentTagger the tagger to evaluate
   * @param testFile description of the test file, handed to
   *                 {@code TaggedFileRecord.createRecord}
   * @throws IOException if reading the test data or writing a debug file fails
   */
  public TestClassifier(MaxentTagger maxentTagger, String testFile) throws IOException {
    this.maxentTagger = maxentTagger;
    this.config = maxentTagger.config;
    setDebug(config.getDebug());

    fileRecord = TaggedFileRecord.createRecord(config, testFile);

    // Fall back to the test file's name when no debug prefix was configured.
    saveRoot = config.getDebugPrefix();
    if (saveRoot == null || saveRoot.isEmpty()) {
      saveRoot = fileRecord.filename();
    }

    test();

    if (writeConfusionMatrix) {
      PrintFile pf = new PrintFile(saveRoot + ".confusion");
      try {
        pf.print(confusionMatrix.toString());
      } finally {
        // Release the file handle even if printing fails.
        pf.close();
      }
    }
  }

  /**
   * Folds the outcome of one tagged sentence into the running totals and
   * writes the per-sentence debug output.
   *
   * @param testS the sentence after tagging
   * @param wordsFile debug output file for words; may be {@code null}
   * @param unknDictFile debug output file for unknown words; may be {@code null}
   * @param topWordsFile debug output file for top words; may be {@code null}
   * @param verboseResults whether to log per-sentence statistics
   */
  private void processResults(TestSentence testS,
                              PrintFile wordsFile, PrintFile unknDictFile,
                              PrintFile topWordsFile, boolean verboseResults) {
    numSentences++;

    // NOTE(review): wordsFile is never used in this method, so the ".words"
    // file stays empty while writeTagsAndErrors receives unknDictFile. This
    // looks like it was meant to be wordsFile - confirm against
    // TestSentence.writeTagsAndErrors before changing; behavior is kept as is.
    testS.writeTagsAndErrors(testS.finalTags, unknDictFile, verboseResults);
    if (writeUnknDict) {
      testS.printUnknown(numSentences, unknDictFile);
    }
    if (writeTopWords) {
      testS.printTop(topWordsFile);
    }

    testS.updateConfusionMatrix(testS.finalTags, confusionMatrix);

    numWrong += testS.numWrong;
    numRight += testS.numRight;
    unknownWords += testS.numUnknown;
    numWrongUnknown += testS.numWrongUnknown;
    if (testS.numWrong == 0) {
      numCorrectSentences++;
    }

    if (verboseResults) {
      log.info("Sentence number: " + numSentences + "; length " + (testS.size-1) +
                         "; correct: " + testS.numRight + "; wrong: " + testS.numWrong +
                         "; unknown wrong: " + testS.numWrongUnknown);
      log.info("  Total tags correct: " + numRight + "; wrong: " + numWrong +
                         "; unknown wrong: " + numWrongUnknown);
    }
  }

  /**
   * Tags every sentence of the test file (which already carries the correct
   * tags) and accumulates accuracy statistics and the confusion matrix.
   * Uses a {@link MulticoreWrapper} unless the configured thread count is 1.
   * Debug files are opened according to the debug flags and are always closed
   * before this method returns, including when an exception is thrown.
   *
   * TODO: Add the ability to have a second transformer to transform output back
   *
   * @throws IOException if a debug file cannot be opened or the data cannot be read
   */
  private void test() throws IOException {
    // Start from a clean slate so the totals describe exactly this run.
    numSentences = 0;
    numCorrectSentences = 0;
    numRight = 0;
    numWrong = 0;
    unknownWords = 0;
    numWrongUnknown = 0;
    confusionMatrix = new ConfusionMatrix<>();

    PrintFile wordsFile = null;
    PrintFile unknDictFile = null;
    PrintFile topWordsFile = null;

    try {
      if (writeWords) {
        wordsFile = new PrintFile(saveRoot + ".words");
      }
      if (writeUnknDict) {
        unknDictFile = new PrintFile(saveRoot + ".un.dict");
      }
      if (writeTopWords) {
        topWordsFile = new PrintFile(saveRoot + ".words.top");
      }

      boolean verboseResults = config.getVerboseResults();
      int nThreads = config.getNThreads();
      TestSentenceProcessor processor = new TestSentenceProcessor(maxentTagger);

      if (nThreads != 1) {
        MulticoreWrapper<List<TaggedWord>, TestSentence> wrapper =
            new MulticoreWrapper<>(nThreads, processor);
        for (List<TaggedWord> taggedSentence : fileRecord.reader()) {
          wrapper.put(taggedSentence);
          // Consume whatever results are already available.
          while (wrapper.peek()) {
            processResults(wrapper.poll(), wordsFile, unknDictFile, topWordsFile, verboseResults);
          }
        }
        wrapper.join();
        // Collect the results still pending after the join.
        while (wrapper.peek()) {
          processResults(wrapper.poll(), wordsFile, unknDictFile, topWordsFile, verboseResults);
        }
      } else {
        // Same tagging steps as the multithreaded path, run inline.
        for (List<TaggedWord> taggedSentence : fileRecord.reader()) {
          processResults(processor.process(taggedSentence),
                         wordsFile, unknDictFile, topWordsFile, verboseResults);
        }
      }
    } finally {
      closeIfOpen(wordsFile);
      closeIfOpen(unknDictFile);
      closeIfOpen(topWordsFile);
    }
  }

  /** Closes {@code file} unless it is {@code null}. */
  private static void closeIfOpen(PrintFile file) {
    if (file != null) {
      file.close();
    }
  }

  /**
   * Returns {@code count} as a percentage of {@code total}. The division is
   * done in floating point, so a zero {@code total} yields NaN (or an
   * infinity) rather than throwing.
   */
  private static double percent(int count, int total) {
    return count * 100.0 / total;
  }

  /**
   * Builds a multi-line, human-readable summary of the model's dimensions and
   * of the accuracy figures accumulated by this evaluation. The line about
   * unknown words is only included when at least one unknown word was seen.
   *
   * @param maxentTagger the tagger whose model statistics are reported
   * @return the formatted summary
   */
  String resultsString(MaxentTagger maxentTagger) {
    int numTags = numRight + numWrong;
    int numWrongSentences = numSentences - numCorrectSentences;

    StringBuilder output = new StringBuilder();
    output.append(String.format("Model %s has xSize=%d, ySize=%d, and numFeatures=%d.%n",
            maxentTagger.config.getModel(),
            maxentTagger.xSize,
            maxentTagger.ySize,
            maxentTagger.getLambdaSolve().lambda.length));
    output.append(String.format("Results on %d sentences and %d words, of which %d were unknown.%n",
            numSentences, numTags, unknownWords));
    output.append(String.format("Total sentences right: %d (%f%%); wrong: %d (%f%%).%n",
            numCorrectSentences, percent(numCorrectSentences, numSentences),
            numWrongSentences, percent(numWrongSentences, numSentences)));
    output.append(String.format("Total tags right: %d (%f%%); wrong: %d (%f%%).%n",
            numRight, percent(numRight, numTags),
            numWrong, percent(numWrong, numTags)));

    if (unknownWords > 0) {
      double wrongUnknownPercent = percent(numWrongUnknown, unknownWords);
      output.append(String.format("Unknown words right: %d (%f%%); wrong: %d (%f%%).%n",
              (unknownWords - numWrongUnknown),
              100.0 - wrongUnknownPercent,
              numWrongUnknown, wrongUnknownPercent));
    }

    return output.toString();
  }

  /**
   * Logs the summary produced by {@link #resultsString(MaxentTagger)}.
   *
   * @param maxentTagger the tagger whose model statistics are reported
   */
  void printModelAndAccuracy(MaxentTagger maxentTagger) {
    // print the output all at once so that multiple threads don't clobber each other's output
    log.info(resultsString(maxentTagger));
  }

  /**
   * Returns the number of tags evaluated, i.e. right plus wrong.
   *
   * @return the sum of the right and wrong tag counts
   */
  int getNumWords() {
    return numRight + numWrong;
  }

  /**
   * Switches all four debug outputs (unknown-word dictionary, words, top
   * words, confusion matrix) on or off together.
   *
   * @param status {@code true} to enable the debug files, {@code false} to disable them
   */
  void setDebug(boolean status) {
    writeUnknDict = status;
    writeWords = status;
    writeTopWords = status;
    writeConfusionMatrix = status;
  }

  /**
   * Processor that tags one gold-tagged sentence and returns the resulting
   * {@link TestSentence}; used with {@link MulticoreWrapper}.
   */
  static class TestSentenceProcessor implements ThreadsafeProcessor<List<TaggedWord>, TestSentence> {
    MaxentTagger maxentTagger;

    public TestSentenceProcessor(MaxentTagger maxentTagger) {
      this.maxentTagger = maxentTagger;
    }

    /**
     * Creates a fresh {@link TestSentence}, records the sentence as its
     * correct tags and tags it.
     *
     * @param taggedSentence the sentence together with its correct tags
     * @return the {@link TestSentence} after tagging
     */
    @Override
    public TestSentence process(List<TaggedWord> taggedSentence) {
      TestSentence testS = new TestSentence(maxentTagger);
      testS.setCorrectTags(taggedSentence);
      testS.tagSentence(taggedSentence, false);
      return testS;
    }

    /** Returns this same instance rather than a copy. */
    @Override
    public ThreadsafeProcessor<List<TaggedWord>, TestSentence> newInstance() {
      // MaxentTagger is threadsafe
      return this;
    }
  }

}