package edu.stanford.nlp.ie.regexp;

import junit.framework.TestCase;

import java.io.*;
import java.util.*;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * A simple test for the regex ner.  Writes out a temporary file with
 * some patterns.  It then reads in those patterns to a couple regex
 * ner classifiers, tests them on a couple sentences, and makes sure
 * it gets the expected results.
 * <br>
 * In all of the "expected" fixtures below, each string holds one
 * space-separated entry per token, and the entry {@code -} means that
 * the token is expected to have no {@code AnswerAnnotation} at all.
 *
 * @author John Bauer
 */
public class RegexNERSequenceClassifierTest extends TestCase {

  /** Pattern file shared by all tests; created lazily by the first {@link #setUp()} call. */
  private static File tempFile = null;

  /** The test sentences, already tokenized: tokens are separated by single spaces. */
  static final String[] words = {
    "My dog likes to eat sausage : turkey , pork , beef , etc .",
    "I went to Shoreline Park and saw an avocet and some curlews ( shorebirds ) ."
  };

  /** One tag per token of {@link #words}; set on each token with {@code setTag}. */
  static final String[] tags = {
    "PRP$ NN RB VBZ VBG NN : NN , NN , NN , FW .",
    "PRP VBD TO NNP NNP CC VBD DT NN CC DT NNS -LRB- NNP -RRB- ."
  };

  /** One pre-existing NER label per token of {@link #words}; used only for {@link #NERsentences}. */
  static final String[] ner = {
    "O O O O O O O O O O O O O O O",
    "O O O LOCATION LOCATION O O O O O O O O O O O"
  };

  /** Expected answers for {@link #testUncased()}, one string per sentence. */
  static final String[] expectedUncased = {
    "- - - - - food - - - - - - - - -",
    "- - - park park - - - shorebird - - shorebird - - - -"
  };

  /** Expected answers for {@link #testCased()}, one string per sentence. */
  static final String[] expectedCased = {
    "- - - - - food - - - - - - - - -",
    "- - - - - - - - shorebird - - shorebird - - - -"
  };

  /**
   * Pattern files used by {@link #testNEROverlaps()}, each given as the
   * complete text of one mapping: tab-separated fields, one pattern per
   * line.  Entry k is checked against {@code expectedNER[k]}.
   */
  static final String[] nerPatterns = {
    "Shoreline Park\tPARK\n",
    "Shoreline Park\tPARK\tLOCATION\n",
    "Shoreline\tPARK\n",
    "Shoreline Park and\tPARK\tLOCATION\n",
    "My\tPOSS\nsausage \\:\tFOO\n",
    "My\tPOSS\nsausage :\tFOO\n",
    "My\tPOSS\n\\. \\.\tFOO\n",
    "\\.\tPERIOD\n",
    ".\tPERIOD\n",
    "\\(|\\)\tPAREN\n",
  };

  /**
   * Expected answers for {@link #testNEROverlaps()}: {@code expectedNER[k][i]}
   * is the expected labeling of sentence i under {@code nerPatterns[k]}.
   */
  static final String[][] expectedNER = {
    { "- - - - - - - - - - - - - - -",
      "- - - - - - - - - - - - - - - -" },
    { "- - - - - - - - - - - - - - -",
      "- - - PARK PARK - - - - - - - - - - -" },
    { "- - - - - - - - - - - - - - -",
      "- - - - - - - - - - - - - - - -" },
    { "- - - - - - - - - - - - - - -",
      "- - - PARK PARK PARK - - - - - - - - - -" }, // not clear it should do this, but does, as it's only tokenwise compatibility
    { "POSS - - - - FOO FOO - - - - - - - -",
      "- - - - - - - - - - - - - - - -" },
    { "POSS - - - - FOO FOO - - - - - - - -",
      "- - - - - - - - - - - - - - - -" },
    { "POSS - - - - - - - - - - - - - -",
      "- - - - - - - - - - - - - - - -" },
    { "- - - - - - - - - - - - - - PERIOD",
      "- - - - - - - - - - - - - - - PERIOD" },
    { "- - - - - - PERIOD - PERIOD - PERIOD - PERIOD - PERIOD",
      "PERIOD - - - - - - - - - - - PERIOD - PERIOD PERIOD" },
    { "- - - - - - - - - - - - - - -",
      "- - - - - - - - - - - - PAREN - PAREN -" },
  };

  /** The test sentences with word and tag set on each token; rebuilt by every {@link #setUp()}. */
  public List<List<CoreLabel>> sentences;

  /** Same as {@link #sentences}, but each token additionally carries the label from {@link #ner}. */
  public List<List<CoreLabel>> NERsentences;

  /**
   * Creates the shared pattern file if it does not exist yet, then rebuilds
   * {@link #sentences} and {@link #NERsentences} from the static fixtures.
   *
   * @throws IOException if the pattern file cannot be created or written
   */
  @Override
  public void setUp() throws IOException {
    // The file is static and shared between test instances, so guard its lazy creation.
    synchronized (RegexNERSequenceClassifierTest.class) {
      if (tempFile == null) {
        // Only published once fully written, so a failed write is retried by the next setUp.
        tempFile = writePatternFile();
      }
    }

    sentences = new ArrayList<List<CoreLabel>>();
    NERsentences = new ArrayList<List<CoreLabel>>();

    assertEquals(words.length, tags.length);
    assertEquals(words.length, ner.length);
    for (int snum = 0; snum < words.length; ++snum) {
      String[] wordPieces = words[snum].split(" ");
      String[] tagPieces = tags[snum].split(" ");
      String[] nerPieces = ner[snum].split(" ");
      assertEquals(wordPieces.length, tagPieces.length);
      assertEquals("Input " + snum + " " + words[snum] + " of different length than " + ner[snum],
                   wordPieces.length, nerPieces.length);

      List<CoreLabel> sentence = new ArrayList<CoreLabel>();
      List<CoreLabel> NERsentence = new ArrayList<CoreLabel>();
      for (int wnum = 0; wnum < wordPieces.length; ++wnum) {
        CoreLabel token = new CoreLabel();
        token.setWord(wordPieces[wnum]);
        token.setTag(tagPieces[wnum]);
        sentence.add(token);

        CoreLabel NERtoken = new CoreLabel();
        NERtoken.setWord(wordPieces[wnum]);
        NERtoken.setTag(tagPieces[wnum]);
        NERtoken.setNER(nerPieces[wnum]);
        NERsentence.add(NERtoken);
      }
      sentences.add(sentence);
      NERsentences.add(NERsentence);
    }
  }

  /**
   * Writes the three patterns used by {@link #testUncased()} and
   * {@link #testCased()} to a new temporary file.
   *
   * @return the fully written pattern file, registered for deletion on JVM exit
   * @throws IOException if the file cannot be created or written
   */
  private static File writePatternFile() throws IOException {
    File file = File.createTempFile("regexnertest.patterns", "txt");
    file.deleteOnExit();
    BufferedWriter bout = new BufferedWriter(new FileWriter(file));
    try {
      bout.write("sausage\tfood\n");
      bout.write("(avocet|curlew)(s?)\tshorebird\n");
      bout.write("shoreline park\tpark\n");
    } finally {
      // Closing the buffered writer flushes it and closes the underlying FileWriter,
      // even when one of the writes above failed.
      bout.close();
    }
    return file;
  }

  /**
   * Renders a sentence as {@code [a, b, c]} using each token's
   * {@code toShortString()}; an empty sentence renders as {@code []}.
   */
  private static String listToString(List<CoreLabel> sentence) {
    StringBuilder sb = new StringBuilder("[");
    String separator = "";
    for (CoreLabel cl : sentence) {
      sb.append(separator).append(cl.toShortString());
      separator = ", ";
    }
    sb.append(']');
    return sb.toString();
  }

  /**
   * Copies a sentence token by token via the {@code CoreLabel} copy
   * constructor, so that classifying the copy leaves the fixture untouched.
   */
  private static List<CoreLabel> deepCopy(List<CoreLabel> in) {
    List<CoreLabel> cll = new ArrayList<CoreLabel>(in.size());
    for (CoreLabel cl : in) {
      cll.add(new CoreLabel(cl));
    }
    return cll;
  }

  /**
   * Asserts that the {@code AnswerAnnotation} of every token matches the
   * expected entry at the same position.
   *
   * @param expected one entry per token; {@code "-"} means the token must have no answer (null)
   * @param sentence the sentence after classification
   */
  private static void compareAnswers(String[] expected, List<CoreLabel> sentence) {
    assertEquals("Lengths different for " + StringUtils.join(expected) +
                 " and " + SentenceUtils.listToString(sentence),
                 expected.length, sentence.size());

    String str = "Comparing " + Arrays.toString(expected) + " and " + listToString(sentence);
    for (int i = 0; i < expected.length; ++i) {
      String answer = sentence.get(i).get(CoreAnnotations.AnswerAnnotation.class);
      if ("-".equals(expected[i])) {
        assertNull(str, answer);
      } else {
        assertEquals(str, expected[i], answer);
      }
    }
  }

  /**
   * Classifies a fresh copy of each sentence in {@code input} and compares
   * the result with the corresponding space-separated entry of {@code expected}.
   */
  private static void checkClassifier(RegexNERSequenceClassifier classifier,
                                      List<List<CoreLabel>> input,
                                      String[] expected) {
    assertEquals(expected.length, input.size());
    for (int i = 0; i < input.size(); ++i) {
      List<CoreLabel> sentence = deepCopy(input.get(i));
      classifier.classify(sentence);
      compareAnswers(expected[i].split(" "), sentence);
    }
  }

  /**
   * Runs the pattern file through a classifier constructed with
   * {@code (path, true, false)} and checks {@link #expectedUncased}.
   * NOTE(review): the second argument is presumably the ignore-case flag --
   * confirm against the RegexNERSequenceClassifier constructor.
   */
  public void testUncased() {
    String tempFilename = tempFile.getPath();
    RegexNERSequenceClassifier uncased =
      new RegexNERSequenceClassifier(tempFilename, true, false);
    checkClassifier(uncased, sentences, expectedUncased);
  }

  /**
   * Runs the pattern file through a classifier constructed with
   * {@code (path, false, false)} and checks {@link #expectedCased}.
   */
  public void testCased() {
    String tempFilename = tempFile.getPath();
    RegexNERSequenceClassifier cased =
      new RegexNERSequenceClassifier(tempFilename, false, false);
    checkClassifier(cased, sentences, expectedCased);
  }

  /**
   * For each entry of {@link #nerPatterns}, builds a classifier from that
   * pattern text and checks its output on the sentences that already carry
   * NER labels against the matching row of {@link #expectedNER}.
   */
  public void testNEROverlaps() {
    assertEquals(expectedNER.length, nerPatterns.length);
    for (int k = 0; k < nerPatterns.length; k++) {
      BufferedReader r1 = new BufferedReader(new StringReader(nerPatterns[k]));
      RegexNERSequenceClassifier cased =
        new RegexNERSequenceClassifier(r1, false, false, null);
      checkClassifier(cased, NERsentences, expectedNER[k]);
    }
  }

}