/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ package cc.mallet.extract.test; import junit.framework.Test; import junit.framework.TestCase; import junit.framework.TestSuite; import java.io.ByteArrayOutputStream; import java.io.PrintStream; import java.io.PrintWriter; import java.io.OutputStreamWriter; import cc.mallet.extract.*; import cc.mallet.pipe.*; import cc.mallet.pipe.iterator.ArrayIterator; import cc.mallet.types.Instance; import cc.mallet.types.InstanceList; import cc.mallet.types.LabelAlphabet; import cc.mallet.types.Sequence; import cc.mallet.util.CharSequenceLexer; /** * Created: Nov 18, 2004 * * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A> * @version $Id: TestPerDocumentF1Evaluator.java,v 1.1 2007/10/22 21:38:02 mccallum Exp $ */ public class TestPerDocumentF1Evaluator extends TestCase { public TestPerDocumentF1Evaluator (String name) { super (name); } public static Test suite () { return new TestSuite (TestPerDocumentF1Evaluator.class); } private static String[] testPred = { "<eater>the big red fox</eater> did it", "it was done by <meal>the dog</meal>", "<eater>the cat</eater> ate the <meal>canary</meal>", "<meal>the hamburger</meal> was eaten by the kid", "<eater>the dog</eater> was eaten with zest", "four score and seven years <meal>ago</meal>" }; private static String[] testTrue = { "<eater>the big red fox</eater> did it", "it was done by <eater>the dog</eater>", "<eater>the cat</eater> ate <meal>the canary</meal>", "<meal>the hamburger</meal> was eaten by <eater>the kid</eater>", "<meal>the dog</meal> was eaten with zest", "four score and seven years ago" }; private Extraction createExtractionFrom (String[] predStrings, String[] trueStrings) { Pipe pipe = new SerialPipes (new Pipe[] { new SGML2TokenSequence (new CharSequenceLexer (CharSequenceLexer.LEX_NONWHITESPACE_CLASSES ), "O"), new Target2LabelSequence (), new PrintInputAndTarget (), }); InstanceList pred = new InstanceList (pipe); pred.addThruPipe (new ArrayIterator (predStrings)); InstanceList targets = new InstanceList (pipe); targets.addThruPipe (new ArrayIterator (trueStrings)); LabelAlphabet dict = (LabelAlphabet) pipe.getTargetAlphabet (); Extraction extraction = new Extraction (null, dict); for (int i = 0; i < pred.size(); i++) { Instance aPred = pred.get (i); Instance aTarget = targets.get (i); Tokenization input = (Tokenization) aPred.getData (); Sequence predSeq = (Sequence) aPred.getTarget (); Sequence targetSeq = (Sequence) aTarget.getTarget (); DocumentExtraction docextr = new DocumentExtraction ("TEST"+i, dict, input, predSeq, targetSeq, "O"); extraction.addDocumentExtraction (docextr); } return extraction; } private static final String testAExpected = "Testing per-document F1\nName\tP\tR\tF1\n" + "eater\t0.6667\t0.5\t0.5714\n" + "O\t0\t1\t0\n" + "meal\t0.25\t0.3333\t0.2857\n" + "OVERALL (micro-averaged) P=0.4286 R=0.4286 F1=0.4286\n" + "OVERALL (macro-averaged) F1=0.4286\n\n"; public void testPerDocEval () { Extraction extraction = createExtractionFrom (testPred, testTrue); PerDocumentF1Evaluator eval = new PerDocumentF1Evaluator (); ByteArrayOutputStream out = new ByteArrayOutputStream (); eval.setErrorOutputStream (System.out); eval.evaluate ("Testing", extraction, new PrintWriter (new OutputStreamWriter (out), true)); String output = out.toString (); assertEquals (testAExpected, output); } private static final String[] mpdPred = { "<title>Wizard of Oz</title> by <author>John Smith</author> and <author>Adam Felber</author>", "<title>Jisp Boo Fuzz by</title> the estimable <title>Rich Q. Doe</title> and <author>Frank Wilson</author>", "<title>Howdy Doody</title> if you think this is Mr. nonsense <author>don't you huh</author>", }; private static final String[] mpdTrue = { "<title>Wizard of Oz</title> by <author>John Smith</author> and <author>Adam Felber</author>", "<title>Jisp Boo Fuzz</title> by the estimable <author>Rich Q. Doe</author> and <author>Frank Wilson</author>", "<title>Howdy Doody</title> if <title>you</title> think this is <title>Mr.</title> <author> nonsense don't you huh</author>", }; private static final String mpdExpected = "Testing SEGMENT counts\nName\tCorrect\tPred\tTarget\n" + "title\t2\t4\t5\n" + "O\t0\t0\t0\n" + "author\t3\t4\t5\n" + "\nTesting per-field F1\n" + "Name\tP\tR\tF1\n" + "title\t0.5\t0.4\t0.4444\n" + "O\t0\t1\t0\n" + "author\t0.75\t0.6\t0.6667\n" + "OVERALL (micro-averaged) P=0.625 R=0.5 F1=0.5556\n" + "OVERALL (macro-averaged) F1=0.5556\n\n"; public void testPerFieldEval () { Extraction extraction = createExtractionFrom (mpdPred, mpdTrue); PerFieldF1Evaluator eval = new PerFieldF1Evaluator (); ByteArrayOutputStream out = new ByteArrayOutputStream (); eval.evaluate ("Testing", extraction, new PrintStream (out)); assertEquals (mpdExpected, out.toString()); } public void testToStdout () { Extraction extraction = createExtractionFrom (mpdPred, mpdTrue); PerFieldF1Evaluator eval = new PerFieldF1Evaluator (); eval.evaluate (extraction); System.out.println ("*** Please verify that something was output above."); } private static final String[] punctPred = { "<title>Wizard of Oz,</title> by <author>John Smith</author> and <author>Adam Felber</author>", "<title>Jisp Boo Fuzz by</title> the estimable <title>Rich Q. Doe</title> and <author>Frank Wilson</author>", "<title>Howdy Doody</title>!, if you think this is Mr. nonsense <author>don't you huh</author>", }; private static final String[] punctTrue = { "<title>Wizard of Oz</title>, by <author>John Smith</author> and <author>Adam Felber</author>", "<title>Jisp Boo Fuzz</title> by the estimable <author>Rich Q. Doe</author> and <author>Frank Wilson</author>", "<title>Howdy Doody!</title>, if <title>you</title> think this is <title>Mr.</title> <author> nonsense don't you huh</author>", }; //xxx Currently fails because grabbing the field span for Howdy Doody! grabs the </title> as // well. I think this is because getting the text subspan goes to the start of the next, // rather than the end of the last. It seems like that should be changed, but I'd need to // think about the ikmplications for Rexa before doing this. public void testPunctuationIgnoringEvaluator () { Extraction extraction = createExtractionFrom (punctPred, punctTrue); PerFieldF1Evaluator eval = new PerFieldF1Evaluator (); eval.setComparator (new PunctuationIgnoringComparator ()); eval.setErrorOutputStream (System.out); ByteArrayOutputStream out = new ByteArrayOutputStream (); eval.evaluate ("Testing", extraction, new PrintStream (out)); assertEquals (mpdExpected, out.toString()); } public void testFieldCleaning () { Extraction extraction = createExtractionFrom (punctPred, punctTrue); extraction.cleanFields (new RegexFieldCleaner ("<.*?>|,|!")); PerFieldF1Evaluator eval = new PerFieldF1Evaluator (); ByteArrayOutputStream out = new ByteArrayOutputStream (); eval.evaluate ("Testing", extraction, new PrintStream (out)); assertEquals (mpdExpected, out.toString()); } public static void main (String[] args) throws Throwable { TestSuite theSuite; if (args.length > 0) { theSuite = new TestSuite (); for (int i = 0; i < args.length; i++) { theSuite.addTest (new TestPerDocumentF1Evaluator (args[i])); } } else { theSuite = (TestSuite) suite (); } junit.textui.TestRunner.run (theSuite); } }