package edu.stanford.nlp.process;
import java.util.*;
import junit.framework.TestCase;
/**
 * Table-driven tests for {@link WordShapeClassifier#wordShape}.
 *
 * <p>A single array of input tokens ({@link #inputs}) is paired, index by
 * index, with one array of expected shape strings per classifier variant.
 * Every expected-output array must therefore have exactly as many entries as
 * {@link #inputs}; {@link #genericCheck} asserts this before comparing.
 *
 * @author Christopher Manning
 */
public class WordShapeClassifierTest extends TestCase {

  /** Input tokens fed to every word shape classifier under test. */
  private static final String[] inputs = { "fabulous", "Jørgensen", "--",
      "beta-carotene", "x-ray", "A.", "supercalifragilisticexpialadocious",
      "58", "59,000", "NF-kappa", "Exxon-Mobil", "a", "A4",
      "IFN-gamma-inducible", "PPARgamma", "NF-kappaB", "CBF1/RBP-Jkappa",
      "", "It's", "A-4", "congrès", "3,35%", "6€", "}", "《", "0-9",
      "四千", "五亿◯", "PQ", "الحرازي", "2008", "427891", "A.B.C.",
      "22-34", "Ak47", "frEaKy", "美方称",
      "alphabeta", "betaalpha", "betalpha", "alpha-beta", "beta-alpha",
      "zalphabeta", "zbetaalpha", "zbetalpha", "zalpha-beta", "zbeta-alpha"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS1} with no known-lowercase set. */
  private static final String[] chris1outputs = { "LOWERCASE", "CAPITALIZED", "SYMBOL",
      "LOWERCASE-DASH", "LOWERCASE-DASH", "ACRONYM1", "LOWERCASE",
      "CARDINAL13", "NUMBER", "CAPITALIZED-DASH", "CAPITALIZED-DASH", "LOWERCASE", "ALLCAPS-DIGIT",
      "CAPITALIZED-DASH", "CAPITALIZED", "CAPITALIZED-DASH", "CAPITALIZED-DIGIT-DASH",
      "SYMBOL", "CAPITALIZED", "ALLCAPS-DIGIT-DASH", "LOWERCASE", "SYMBOL-DIGIT", "SYMBOL-DIGIT", "SYMBOL", "SYMBOL", "DIGIT-DASH",
      "LOWERCASE", "LOWERCASE", "ALLCAPS", "LOWERCASE", "CARDINAL4", "CARDINAL5PLUS", "ACRONYM",
      "DIGIT-DASH", "CAPITALIZED-DIGIT", "MIXEDCASE", "LOWERCASE",
      "LOWERCASE", "LOWERCASE", "LOWERCASE", "LOWERCASE-DASH", "LOWERCASE-DASH",
      "LOWERCASE", "LOWERCASE", "LOWERCASE", "LOWERCASE-DASH", "LOWERCASE-DASH"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS2} with no known-lowercase set. */
  private static final String[] chris2outputs = { "xxxxx", "Xxxxx", "--",
      "g-xxx", "x-xxx", "X.", "xxxxx",
      "dd", "dd,ddd", "XX-g", "Xx-Xxxx", "x", "Xd",
      "XX-Xgxxx", "XXXg", "XX-gX", "XX-/Xdg",
      "", "Xx'x", "X-d", "xxxxx", "d,dd%", "d€", "}", "《", "d-d",
      "四千", "五亿◯", "XX", "الاحرزي", "dddd", "ddddd", "X..XX.",
      "dd-dd", "Xxdd", "xxXxXx", "美方称",
      "gg", "gg", "gxxx", "g-g", "g-g",
      "xgg", "xgg", "xgxxx", "xg-g", "xg-g"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS2USELC} with {@link #knownLC} supplied. */
  private static final String[] chris2KnownLCoutputs = { "xxxxxk", "Xxxxx", "--",
      "g-xxx", "x-xxx", "X.", "xxxxx",
      "dd", "dd,ddd", "XX-g", "Xx-Xxxx", "xk", "Xd",
      "XX-Xgxxx", "XXXg", "XX-gX", "XX-/Xdg",
      "", "Xx'x", "X-d", "xxxxx", "d,dd%", "d€", "}", "《", "d-d",
      "四千", "五亿◯", "XX", "الاحرزي", "dddd", "ddddd", "X..XX.",
      "dd-dd", "Xxdd", "xxXxXx", "美方称",
      "gg", "gg", "gxxx", "g-g", "g-g",
      "xgg", "xgg", "xgxxx", "xg-g", "xg-g"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS3} with no known-lowercase set. */
  private static final String[] chris3outputs = { "xxxx", "Xxxx", "--",
      "g-xx", "x-xx", "X.", "xxxx",
      "dd", "dd,dd", "XX-g", "Xx-xx", "x", "Xd",
      "XX-gxx", "XXg", "XX-gX", "XX-/dg",
      "", "Xx'x", "X-d", "xxxx", "d,d%", "d€", "}", "《", "d-d",
      "四千", "五亿◯", "XX", "الحرزي", "dddd", "dddd", "X.X.",
      "dd-dd", "Xxdd", "xxXx", "美方称",
      "g", "g", "gxx", "g-", "g-",
      "xg", "xg", "xgxx", "xg-", "xg-"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS3USELC} with {@link #knownLC} supplied. */
  private static final String[] chris3KnownLCoutputs = { "xxxxk", "Xxxx", "--",
      "g-xx", "x-xx", "X.", "xxxx",
      "dd", "dd,dd", "XX-g", "Xx-xx", "xk", "Xd",
      "XX-gxx", "XXg", "XX-gX", "XX-/dg",
      "", "Xx'x", "X-d", "xxxx", "d,d%", "d€", "}", "《", "d-d",
      "四千", "五亿◯", "XX", "الحرزي", "dddd", "dddd", "X.X.",
      "dd-dd", "Xxdd", "xxXx", "美方称",
      "g", "g", "gxx", "g-", "g-",
      "xg", "xg", "xgxx", "xg-", "xg-"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS4} with no known-lowercase set. */
  private static final String[] chris4outputs = { "xxxxx", "Xxxxx", "--",
      "g-xxx", "x-xxx", "X.", "xxxxx",
      "dd", "dd.ddd", "XX-g", "Xx-Xxxx", "x", "Xd",
      "XX-Xgxxx", "XXXg", "XX-gX", "XX-.Xdg",
      "", "Xx'x", "X-d", "xxxxx", "d.dd%", "d$", ")", "(", "d-d",
      "dd", "ddd", "XX", "ccccc", "dddd", "ddddd", "X..XX.",
      "dd-dd", "Xxdd", "xxXxXx", "ccc",
      "gg", "gg", "gxxx", "g-g", "g-g",
      "xgg", "xgg", "xgxxx", "xg-g", "xg-g"
  };

  /** Expected shapes for {@code WORDSHAPECHRIS4} with {@link #knownLC} supplied. */
  private static final String[] chris4KnownLCoutputs = { "xxxxxk", "Xxxxx", "--",
      "g-xxx", "x-xxx", "X.", "xxxxx",
      "dd", "dd.ddd", "XX-g", "Xx-Xxxx", "xk", "Xd",
      "XX-Xgxxx", "XXXg", "XX-gX", "XX-.Xdg",
      "", "Xx'x", "X-d", "xxxxx", "d.dd%", "d$", ")", "(", "d-d",
      "dd", "ddd", "XX", "ccccc", "dddd", "ddddd", "X..XX.",
      "dd-dd", "Xxdd", "xxXxXx", "ccc",
      "gg", "gg", "gxxx", "g-g", "g-g",
      "xgg", "xgg", "xgxxx", "xg-g", "xg-g"
  };

  /** Expected shapes for {@code WORDSHAPEDIGITS} with no known-lowercase set. */
  private static final String[] digitsOutputs = { "fabulous", "Jørgensen", "--",
      "beta-carotene", "x-ray", "A.", "supercalifragilisticexpialadocious",
      "99", "99,999", "NF-kappa", "Exxon-Mobil", "a", "A9",
      "IFN-gamma-inducible", "PPARgamma", "NF-kappaB", "CBF9/RBP-Jkappa",
      "", "It's", "A-9", "congrès", "9,99%", "9€", "}", "《", "9-9",
      "四千", "五亿◯", "PQ", "الحرازي", "9999", "999999", "A.B.C.",
      "99-99", "Ak99", "frEaKy", "美方称",
      "alphabeta", "betaalpha", "betalpha", "alpha-beta", "beta-alpha",
      "zalphabeta", "zbetaalpha", "zbetalpha", "zalpha-beta", "zbeta-alpha"
  };

  /** Words handed to the classifier as the "known lowercase" set in the LC tests. */
  private static final String[] knownLC = { "house", "fabulous", "octopus", "a" };

  /**
   * Converts an optional array of known-lowercase words into the set form
   * passed to {@code WordShapeClassifier.wordShape}.
   *
   * @param knownLCWords words to include, or {@code null} for "no set"
   * @return a new set holding the given words, or {@code null} if the
   *         argument was {@code null}
   */
  private static Set<String> toKnownLCSet(String[] knownLCWords) {
    if (knownLCWords == null) {
      return null;
    }
    return new HashSet<String>(Arrays.asList(knownLCWords));
  }

  /**
   * Checks that a word shaper maps each input to its expected shape, and that
   * it throws {@link NullPointerException} when given a {@code null} word.
   *
   * @param wordshape the {@code WordShapeClassifier} shaper constant to test
   * @param in input tokens
   * @param shape expected shapes; must be the same length as {@code in}
   * @param knownLCWords known-lowercase words to pass to the shaper as a set,
   *        or {@code null} to pass a {@code null} set
   */
  public static void genericCheck(int wordshape, String[] in, String[] shape,
                                  String[] knownLCWords) {
    assertEquals("WordShapeClassifierTest is bung: array sizes differ",
                 in.length, shape.length);
    // Build the set from the argument actually supplied by the caller rather
    // than always from the class-level knownLC fixture.
    Set<String> knownLCset = toKnownLCSet(knownLCWords);

    for (int i = 0; i < in.length; i++) {
      assertEquals("WordShape " + wordshape + " for " + in[i] + " with " + (knownLCset == null ? "null": "non-null") + " knownLCwords is not correct!", shape[i], WordShapeClassifier.wordShape(in[i], wordshape, knownLCset));
    }

    try {
      WordShapeClassifier.wordShape(null, wordshape);
      fail("WordShapeClassifier threw no exception on null");
    } catch (NullPointerException npe) {
      // this is the good answer
    } catch (Exception e) {
      // Any other exception type is the wrong failure mode for a null word.
      fail("WordShapeClassifier didn't throw NullPointerException on null");
    }
  }

  /**
   * Prints, for each input, the shape the classifier actually produces and
   * (when available) the expected shape in parentheses. This is a manual
   * debugging aid; it performs no assertions.
   *
   * @param wordshape the {@code WordShapeClassifier} shaper constant to run
   * @param in input tokens
   * @param shape expected shapes; may be shorter than {@code in}, in which
   *        case inputs without a counterpart are printed without one
   * @param knownLCWords known-lowercase words to pass to the shaper as a set,
   *        or {@code null} to pass a {@code null} set
   */
  public static void outputResults(int wordshape, String[] in, String[] shape,
                                   String[] knownLCWords) {
    System.out.println("======================");
    System.out.println(" Classifier " + wordshape);
    System.out.println("======================");
    // Use the caller-supplied words, consistent with genericCheck.
    Set<String> knownLCset = toKnownLCSet(knownLCWords);

    for (int i = 0; i < in.length; ++i) {
      String result =
          WordShapeClassifier.wordShape(in[i], wordshape, knownLCset);
      System.out.print(" " + in[i] + ": " + result);
      if (i < shape.length) {
        System.out.print(" (" + shape[i] + ")");
      }
      System.out.println();
    }
  }

  /** Tests {@code WORDSHAPECHRIS1} without a known-lowercase set. */
  public void testChris1() {
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS1, inputs, chris1outputs, null);
  }

  /** Tests {@code WORDSHAPECHRIS2} without, and {@code WORDSHAPECHRIS2USELC} with, known-lowercase words. */
  public void testChris2() {
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS2, inputs, chris2outputs, null);
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS2USELC, inputs, chris2KnownLCoutputs, knownLC);
  }

  /** Tests {@code WORDSHAPECHRIS3} without, and {@code WORDSHAPECHRIS3USELC} with, known-lowercase words. */
  public void testChris3() {
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS3, inputs, chris3outputs, null);
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS3USELC, inputs, chris3KnownLCoutputs, knownLC);
  }

  /** Tests {@code WORDSHAPECHRIS4} both without and with known-lowercase words. */
  public void testChris4() {
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS4, inputs, chris4outputs, null);
    genericCheck(WordShapeClassifier.WORDSHAPECHRIS4, inputs, chris4KnownLCoutputs, knownLC);
  }

  /** Tests {@code WORDSHAPEDIGITS} without a known-lowercase set. */
  public void testDigits() {
    genericCheck(WordShapeClassifier.WORDSHAPEDIGITS, inputs, digitsOutputs, null);
  }

}