package edu.stanford.nlp.ie.crf;

import edu.stanford.nlp.optimization.CmdEvaluator;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.stats.MultiClassChunkEvalStats;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

import java.io.*;
import java.util.Collection;
import java.util.List;

/**
 * Evaluates CRFClassifier on a set of data
 *  - called by QNMinimizer periodically
 *  - If evalCmd is set, runs command line specified by evalCmd
 *    otherwise does evaluation internally
 * NOTE: when running conlleval with exec on Linux, linux will first
 *       fork process by duplicating memory of current process.  So if
 *       JVM has lots of memory, it will all be duplicated when
 *       child process is initially forked.
 *
 * @author Angel Chang
 */
public class CRFClassifierEvaluator<IN extends CoreMap> extends CmdEvaluator {

  private CRFClassifier<IN> classifier;
  /** Helper used by {@link #setValues(double[])} to reshape the flat weight vector. */
  private CRFLogConditionalObjectiveFunction func;

  // NOTE: Default uses -r, specify without -r if IOB
  private String cmdStr = "/u/nlp/bin/conlleval -r";
  /** Command derived from {@link #cmdStr}; {@code null} selects the internal evaluation path. */
  private String[] cmd;

  // TODO: Use data structure to hold data + features
  // Cache already featurized documents
  // Original object bank
  Collection<List<IN>> data;
  // Featurized data
  List<Pair<int[][][], int[]>> featurizedData;

  /**
   * Creates a fully configured evaluator that uses the default evaluation command.
   *
   * @param description    stored in the {@code description} field (not declared in this
   *                       class; presumably inherited from {@code CmdEvaluator})
   * @param classifier     classifier whose weights are replaced on every evaluation
   * @param func           helper used to convert the flat weight vector to 2D
   * @param data           documents to classify
   * @param featurizedData featurized form of {@code data}
   */
  public CRFClassifierEvaluator(String description,
                                CRFClassifier<IN> classifier,
                                CRFLogConditionalObjectiveFunction func,
                                Collection<List<IN>> data,
                                List<Pair<int[][][], int[]>> featurizedData) {
    this.description = description;
    this.classifier = classifier;
    this.func = func;
    this.data = data;
    this.featurizedData = featurizedData;
    // getCmd(String) is not declared in this class; presumably an inherited helper
    // that turns the command string into an argument array.
    cmd = getCmd(cmdStr);
  }

  /**
   * Creates a partially configured evaluator.  The helper function and the test data
   * must be supplied through {@link #setHelperFunction} and {@link #setTestData} before
   * {@link #evaluate(double[])} is called.
   * <p>
   * Unlike the five-argument constructor, this one does not initialise the command
   * array, so {@link #getCmd()} returns {@code null} and {@link #evaluate(double[])}
   * evaluates internally until {@link #setEvalCmd(String)} is called.
   *
   * @param description stored in the {@code description} field
   * @param classifier  classifier whose weights are replaced on every evaluation
   */
  public CRFClassifierEvaluator(String description,
                                CRFClassifier<IN> classifier) {
    this.description = description;
    this.classifier = classifier;
  }

  /**
   * Set helper function
   *
   * @param func objective function whose {@code to2D} is used by {@link #setValues(double[])}
   */
  public void setHelperFunction(CRFLogConditionalObjectiveFunction func) {
    this.func = func;
  }

  /**
   * Set the data to test on
   *
   * @param data           documents to classify
   * @param featurizedData featurized form of {@code data}
   */
  public void setTestData(Collection<List<IN>> data, List<Pair<int[][][], int[]>> featurizedData) {
    this.data = data;
    this.featurizedData = featurizedData;
  }

  /**
   * Set the evaluation command (set to null to skip evaluation using command line)
   *
   * @param evalCmd command line to run; it is trimmed first, and an empty or
   *                whitespace-only value is treated the same as {@code null}
   */
  public void setEvalCmd(String evalCmd) {
    this.cmdStr = evalCmd;
    if (cmdStr != null) {
      cmdStr = cmdStr.trim();
      if (cmdStr.length() == 0) {
        cmdStr = null;
      }
    }
    cmd = getCmd(cmdStr);
  }

  /**
   * Installs the given weights on the classifier.
   *
   * @param x flat weight vector; converted with {@code func.to2D(x)}, so the helper
   *          function must have been set
   */
  public void setValues(double[] x) {
    // TODO: Avoid this conversion of weights from 1D to 2D and usage of the
    //       CRFLogConditionalObjectiveFunction
    //       (unnecessary and expensive if weights are large vectors - like say 100 million)
    classifier.weights = func.to2D(x);
  }

  /**
   * @return the current command array, or {@code null} if none has been set
   */
  public String[] getCmd() {
    return cmd;
  }

  /**
   * Classifies the test data and writes the answers to the given stream.
   *
   * @param outputStream destination for the classifier output; not closed by this method
   * @throws RuntimeException wrapping any exception raised while classifying or writing
   */
  public void outputToCmd(OutputStream outputStream) {
    try {
      classifier.classifyAndWriteAnswers(data, featurizedData, outputStream,
                                         classifier.makeReaderAndWriter());
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Installs the weights {@code x} on the classifier and evaluates it on the test data.
   * <p>
   * If a command is set, it is run through {@code evaluateCmd} and the returned score
   * is 0.  Otherwise the answers are written to a temporary file, scored with
   * {@link MultiClassChunkEvalStats}, and the conlleval-style report is printed to
   * {@code System.err}.
   *
   * @param x flat weight vector to evaluate
   * @return the chunk evaluation score on the internal path; 0 when the external
   *         command is used
   * @throws RuntimeException wrapping any exception raised during internal evaluation
   */
  public double evaluate(double[] x) {
    double score = 0;
    setValues(x);
    if (getCmd() != null) {
      evaluateCmd(getCmd());
    } else {
      try {
        // TODO: Classify in memory instead of writing to tmp file
        File f = File.createTempFile("CRFClassifierEvaluator", ".txt");
        f.deleteOnExit();
        try {
          OutputStream outputStream = new BufferedOutputStream(new FileOutputStream(f));
          try {
            classifier.classifyAndWriteAnswers(data, featurizedData, outputStream,
                                               classifier.makeReaderAndWriter());
          } finally {
            // Close (and thereby flush) even if classification fails.
            outputStream.close();
          }
          BufferedReader br = new BufferedReader(new FileReader(f));
          try {
            MultiClassChunkEvalStats stats = new MultiClassChunkEvalStats("O");
            score = stats.score(br, "\t");
            System.err.println(stats.getConllEvalString());
          } finally {
            // The reader must be closed before the file is deleted: an open handle
            // leaks a descriptor and prevents deletion on some platforms.
            br.close();
          }
        } finally {
          // Remove the temp file eagerly on both success and failure;
          // deleteOnExit() above is only a fallback.
          f.delete();
        }
      } catch (Exception ex) {
        throw new RuntimeException(ex);
      }
    }
    return score;
  }

}