package edu.stanford.nlp.tagger.maxent; import edu.stanford.nlp.util.logging.Redwood; import java.io.IOException; import java.util.List; import edu.stanford.nlp.io.PrintFile; import edu.stanford.nlp.ling.TaggedWord; import edu.stanford.nlp.tagger.io.TaggedFileRecord; import edu.stanford.nlp.util.ConfusionMatrix; import edu.stanford.nlp.util.concurrent.MulticoreWrapper; import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor; /** Tags data and can handle either data with gold-standard tags (computing * performance statistics) or unlabeled data. * * @author Kristina Toutanova * @version 1.0 */ // TODO: can we break this class up in some way? Perhaps we can // spread some functionality into TestSentence and some into MaxentTagger // TODO: at the very least, it doesn't seem to make sense to make it // an object with state, rather than just some static methods public class TestClassifier { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(TestClassifier.class); private final TaggedFileRecord fileRecord; private int numRight; private int numWrong; private int unknownWords; private int numWrongUnknown; private int numCorrectSentences; private int numSentences; private ConfusionMatrix<String> confusionMatrix; // TODO: only one boolean here instead of 4? They all use the same // debug status private boolean writeUnknDict; private boolean writeWords; private boolean writeTopWords; private boolean writeConfusionMatrix; MaxentTagger maxentTagger; TaggerConfig config; String saveRoot; public TestClassifier(MaxentTagger maxentTagger) throws IOException { this(maxentTagger, maxentTagger.config.getFile()); } public TestClassifier(MaxentTagger maxentTagger, String testFile) throws IOException { this.maxentTagger = maxentTagger; this.config = maxentTagger.config; setDebug(config.getDebug()); fileRecord = TaggedFileRecord.createRecord(config, testFile); saveRoot = config.getDebugPrefix(); if (saveRoot == null || saveRoot.equals("")) { saveRoot = fileRecord.filename(); } test(); if (writeConfusionMatrix) { PrintFile pf = new PrintFile(saveRoot + ".confusion"); pf.print(confusionMatrix.toString()); pf.close(); } } private void processResults(TestSentence testS, PrintFile wordsFile, PrintFile unknDictFile, PrintFile topWordsFile, boolean verboseResults) { numSentences++; testS.writeTagsAndErrors(testS.finalTags, unknDictFile, verboseResults); if (writeUnknDict) testS.printUnknown(numSentences, unknDictFile); if (writeTopWords) testS.printTop(topWordsFile); testS.updateConfusionMatrix(testS.finalTags, confusionMatrix); numWrong = numWrong + testS.numWrong; numRight = numRight + testS.numRight; unknownWords = unknownWords + testS.numUnknown; numWrongUnknown = numWrongUnknown + testS.numWrongUnknown; if (testS.numWrong == 0) { numCorrectSentences++; } if (verboseResults) { log.info("Sentence number: " + numSentences + "; length " + (testS.size-1) + "; correct: " + testS.numRight + "; wrong: " + testS.numWrong + "; unknown wrong: " + testS.numWrongUnknown); log.info(" Total tags correct: " + numRight + "; wrong: " + numWrong + "; unknown wrong: " + numWrongUnknown); } } /** * Test on a file containing correct tags already. when init'ing from trees * TODO: Add the ability to have a second transformer to transform output back; possibly combine this method * with method below */ private void test() throws IOException { numSentences = 0; confusionMatrix = new ConfusionMatrix<>(); PrintFile pf = null; PrintFile pf1 = null; PrintFile pf3 = null; if(writeWords) pf = new PrintFile(saveRoot + ".words"); if(writeUnknDict) pf1 = new PrintFile(saveRoot + ".un.dict"); if(writeTopWords) pf3 = new PrintFile(saveRoot + ".words.top"); boolean verboseResults = config.getVerboseResults(); if (config.getNThreads() != 1) { MulticoreWrapper<List<TaggedWord>, TestSentence> wrapper = new MulticoreWrapper<>(config.getNThreads(), new TestSentenceProcessor(maxentTagger)); for (List<TaggedWord> taggedSentence : fileRecord.reader()) { wrapper.put(taggedSentence); while (wrapper.peek()) { processResults(wrapper.poll(), pf, pf1, pf3, verboseResults); } } wrapper.join(); while (wrapper.peek()) { processResults(wrapper.poll(), pf, pf1, pf3, verboseResults); } } else{ for (List<TaggedWord> taggedSentence : fileRecord.reader()) { TestSentence testS = new TestSentence(maxentTagger); testS.setCorrectTags(taggedSentence); testS.tagSentence(taggedSentence, false); processResults(testS, pf, pf1, pf3, verboseResults); } } if(pf != null) pf.close(); if(pf1 != null) pf1.close(); if(pf3 != null) pf3.close(); } String resultsString(MaxentTagger maxentTagger) { StringBuilder output = new StringBuilder(); output.append(String.format("Model %s has xSize=%d, ySize=%d, and numFeatures=%d.%n", maxentTagger.config.getModel(), maxentTagger.xSize, maxentTagger.ySize, maxentTagger.getLambdaSolve().lambda.length)); output.append(String.format("Results on %d sentences and %d words, of which %d were unknown.%n", numSentences, numRight + numWrong, unknownWords)); output.append(String.format("Total sentences right: %d (%f%%); wrong: %d (%f%%).%n", numCorrectSentences, numCorrectSentences * 100.0 / numSentences, numSentences - numCorrectSentences, (numSentences - numCorrectSentences) * 100.0 / (numSentences))); output.append(String.format("Total tags right: %d (%f%%); wrong: %d (%f%%).%n", numRight, numRight * 100.0 / (numRight + numWrong), numWrong, numWrong * 100.0 / (numRight + numWrong))); if (unknownWords > 0) { output.append(String.format("Unknown words right: %d (%f%%); wrong: %d (%f%%).%n", (unknownWords - numWrongUnknown), 100.0 - (numWrongUnknown * 100.0 / unknownWords), numWrongUnknown, numWrongUnknown * 100.0 / unknownWords)); } return output.toString(); } void printModelAndAccuracy(MaxentTagger maxentTagger) { // print the output all at once so that multiple threads don't clobber each other's output log.info(resultsString(maxentTagger)); } int getNumWords() { return numRight + numWrong; } void setDebug(boolean status) { writeUnknDict = status; writeWords = status; writeTopWords = status; writeConfusionMatrix = status; } static class TestSentenceProcessor implements ThreadsafeProcessor<List<TaggedWord>, TestSentence> { MaxentTagger maxentTagger; public TestSentenceProcessor(MaxentTagger maxentTagger) { this.maxentTagger = maxentTagger; } @Override public TestSentence process(List<TaggedWord> taggedSentence) { TestSentence testS = new TestSentence(maxentTagger); testS.setCorrectTags(taggedSentence); testS.tagSentence(taggedSentence, false); return testS; } @Override public ThreadsafeProcessor<List<TaggedWord>, TestSentence> newInstance() { // MaxentTagger is threadsafe return this; } } }