package com.kennycason.kumo.nlp.tokenizer;

import ch.lambdaj.Lambda;
import org.apache.log4j.Logger;
import org.junit.Test;

import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * Exercises {@link ChineseWordTokenizer} through the {@link WordTokenizer} interface:
 * sample Chinese phrases are tokenized, the resulting tokens are logged, and for the
 * first three phrases the number of returned tokens is asserted.
 *
 * Created by kenny on 4/27/15.
 */
public class ChineseWordTokenizerTest {

    private static final Logger LOGGER = Logger.getLogger(ChineseWordTokenizerTest.class);

    /**
     * Builds a tokenizer (logging how long construction took), then runs it over
     * several phrases. Token counts are asserted for the first three phrases; the
     * two longer passages at the end are only tokenized and logged.
     */
    @Test
    public void test() {
        final long startMillis = System.currentTimeMillis();
        final WordTokenizer tokenizer = new ChineseWordTokenizer();
        final long elapsedMillis = System.currentTimeMillis() - startMillis;
        LOGGER.info("load time: " + elapsedMillis + " ms");

        final List<String> singleTerm = tokenizeAndLog(tokenizer, "弹道导弹");
        assertEquals(1, singleTerm.size());

        final List<String> mixedScript = tokenizeAndLog(tokenizer, "美国人的文化.dog");
        assertEquals(6, mixedScript.size());

        final List<String> shortSentence = tokenizeAndLog(tokenizer, "我是美国人");
        assertEquals(4, shortSentence.size());

        // The remaining passages have no expected token count; their output is only logged.
        tokenizeAndLog(tokenizer, "政府依照法律行使执法权,如果超出法律赋予的权限范围,就是“滥用职权”;如果没有完全行使执法权,就是“不作为”。两者都是政府的错误。");
        tokenizeAndLog(tokenizer, "国家都有自己的政府。政府是税收的主体,可以实现福利的合理利用。");
    }

    /**
     * Tokenizes {@code text} with the given tokenizer and logs the joined tokens at INFO level.
     *
     * @param tokenizer the tokenizer under test
     * @param text      the input to tokenize
     * @return the tokens returned by the tokenizer
     */
    private static List<String> tokenizeAndLog(final WordTokenizer tokenizer, final String text) {
        final List<String> tokens = tokenizer.tokenize(text);
        LOGGER.info(Lambda.join(tokens));
        return tokens;
    }

}