import org.junit.*;
import static org.junit.Assert.*;
import com.transmem.nlp.EnglishSegmenter;
/**
 * Unit tests for {@link EnglishSegmenter#segment(String)}.
 *
 * <p>Each fixture pairs an input sentence with the exact token sequence the
 * segmenter is expected to produce, covering contractions, leading/trailing
 * whitespace, mixed whitespace characters, punctuation runs, hyphenation,
 * and non-ASCII text.
 */
public class EnglishSegmenterTest
{
    /** Immutable fixture: an input sentence and its expected token array. */
    private static final class Sentence
    {
        final String sentence;
        final String[] tokens;

        Sentence(String sentence, String[] tokens)
        {
            this.sentence = sentence;
            this.tokens = tokens;
        }
    }

    @Test public void testSegment()
    {
        EnglishSegmenter es = new EnglishSegmenter();
        Sentence[] sents = new Sentence[]{
            new Sentence("I'm a programmer.",new String[]{"I","'","m","a","programmer","."}),
            new Sentence(" A space before.",new String[]{"A","space","before","."}),
            new Sentence(" Spaces at ends. ",new String[]{"Spaces","at","ends","."}),
            new Sentence("Whitespaces\there\r\n",new String[]{"Whitespaces","here"}),
            new Sentence("#!random^marks%^&and(",new String[]{"#","!","random","^","marks","%","^","&","and","("}),
            new Sentence("hyphen-ated words",new String[]{"hyphen","-","ated","words"}),
            // NOTE(review): the literals below are mojibake — almost certainly
            // mis-encoded CJK fixture text. Bytes kept as-is to preserve the test's
            // behavior; confirm the intended characters and the file's encoding
            // (should be UTF-8) against version control.
            new Sentence("���Ĵ��� in it��",new String[]{"���Ĵ���","in","it","��"})
        };
        for (Sentence s: sents)
        {
            // assertArrayEquals reports both a length mismatch and the first
            // differing index, replacing the previous manual length + loop checks.
            assertArrayEquals("Tokens for '"+s.sentence+"'", s.tokens, es.segment(s.sentence));
        }
    }
}