package org.deeplearning4j.text.tokenization.tokenizer;

import org.deeplearning4j.text.tokenization.tokenizerfactory.JapaneseTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.junit.Test;

import java.util.List;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class JapaneseTokenizerTest {

    private String toTokenize = "黒い瞳の綺麗な女の子";
    private String[] expect = {"黒い", "瞳", "の", "綺麗", "な", "女の子"};
    private String baseString = "驚いた彼は道を走っていった。";

    @Test
    public void testJapaneseTokenizer() throws Exception {
        TokenizerFactory t = new JapaneseTokenizerFactory();
        Tokenizer tokenizer = t.create(toTokenize);

        assertEquals(expect.length, tokenizer.countTokens());
        for (int i = 0; i < tokenizer.countTokens(); ++i) {
            assertEquals(expect[i], tokenizer.nextToken());
        }
    }

    @Test
    public void testBaseForm() throws Exception {
        // With the base-form flag enabled, the tokenizer emits dictionary forms,
        // e.g. the past-tense "驚いた" is reduced to "驚く".
        TokenizerFactory tf = new JapaneseTokenizerFactory(true);

        Tokenizer tokenizer1 = tf.create(toTokenize);
        Tokenizer tokenizer2 = tf.create(baseString);

        assertEquals("黒い", tokenizer1.nextToken());
        assertEquals("驚く", tokenizer2.nextToken());
    }

    @Test
    public void testGetTokens() throws Exception {
        TokenizerFactory tf = new JapaneseTokenizerFactory();
        Tokenizer tokenizer = tf.create(toTokenize);

        // Exhaust the iterator.
        assertEquals(expect.length, tokenizer.countTokens());
        for (int i = 0; i < tokenizer.countTokens(); ++i) {
            assertEquals(expect[i], tokenizer.nextToken());
        }

        // Ensure it is exhausted.
        assertEquals(false, tokenizer.hasMoreTokens());

        // The count doesn't change.
        assertEquals(expect.length, tokenizer.countTokens());

        // getTokens still returns everything.
        List<String> tokens = tokenizer.getTokens();
        assertEquals(expect.length, tokens.size());
    }

    @Test
    public void testKuromojiMultithreading() throws Exception {
        class Worker implements Runnable {
            private final JapaneseTokenizerFactory tf;
            private final String[] jobs;
            private int runs;
            private boolean passed = false;

            public Worker(JapaneseTokenizerFactory tf, String[] jobs, int runs) {
                this.tf = tf;
                this.jobs = jobs;
                this.runs = runs;
            }

            @Override
            public void run() {
                while (runs > 0) {
                    String s = jobs[runs-- % jobs.length];
                    List<String> tokens = tf.create(s).getTokens();

                    // Concatenating the tokens should reproduce the input exactly;
                    // bail out (leaving passed == false) if it does not.
                    StringBuilder sb = new StringBuilder();
                    for (String token : tokens) {
                        sb.append(token);
                    }
                    if (sb.toString().length() != s.length()) {
                        return;
                    }
                }
                passed = true;
            }
        }

        JapaneseTokenizerFactory tf = new JapaneseTokenizerFactory();
        String[] work = {toTokenize, baseString, toTokenize, baseString};

        // Share one factory across ten workers, each tokenizing 50 strings,
        // to exercise the underlying Kuromoji tokenizer from multiple threads.
        Worker[] workers = new Worker[10];
        for (int i = 0; i < workers.length; i++) {
            workers[i] = new Worker(tf, work, 50);
        }

        Thread[] threads = new Thread[10];
        for (int i = 0; i < threads.length; i++) {
            threads[i] = new Thread(workers[i]);
            threads[i].start();
        }

        for (Thread thread : threads) {
            thread.join();
        }

        for (int i = 0; i < workers.length; i++) {
            assertTrue(workers[i].passed);
        }
    }
}