import org.junit.Test;

import static org.junit.Assert.assertEquals;

import com.transmem.nlp.EnglishSegmenter;

public class EnglishSegmenterTest {

    /** Simple holder pairing an input sentence with its expected tokens. */
    private static class Sentence {
        public final String sentence;
        public final String[] tokens;

        public Sentence(String s, String[] t) {
            sentence = s;
            tokens = t;
        }
    }

    @Test
    public void testSegment() {
        EnglishSegmenter es = new EnglishSegmenter();

        Sentence[] sents = new Sentence[] {
            // Apostrophes are split off as separate tokens.
            new Sentence("I'm a programmer.", new String[]{"I", "'", "m", "a", "programmer", "."}),
            // Leading and trailing whitespace is dropped.
            new Sentence(" A space before.", new String[]{"A", "space", "before", "."}),
            new Sentence(" Spaces at ends. ", new String[]{"Spaces", "at", "ends", "."}),
            // Tabs and line breaks count as whitespace, not as tokens.
            new Sentence("Whitespaces\there\r\n", new String[]{"Whitespaces", "here"}),
            // Each punctuation mark becomes its own token.
            new Sentence("#!random^marks%^&and(", new String[]{"#", "!", "random", "^", "marks", "%", "^", "&", "and", "("}),
            // Hyphenated words are split around the hyphen.
            new Sentence("hyphen-ated words", new String[]{"hyphen", "-", "ated", "words"}),
            // CJK text stays together as a single token; CJK punctuation is split off.
            new Sentence("中文词语 in it。", new String[]{"中文词语", "in", "it", "。"})
        };

        for (Sentence s : sents) {
            String[] tokens = es.segment(s.sentence);
            assertEquals("Token count should be equal for '" + s.sentence + "'",
                    s.tokens.length, tokens.length);
            for (int i = 0; i < tokens.length; i++) {
                assertEquals("Token " + i + " in '" + s.sentence + "'", s.tokens[i], tokens[i]);
            }
        }
    }
}