/*-*
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import static org.junit.Assert.*;

/**
 * Shared JUnit assertion helpers for tokenizer tests.
 */
public class TestUtils {

    /**
     * Asserts that the surface forms of the given tokens equal the expected list, in order.
     *
     * @param expectedSurfaces  expected surface forms
     * @param actualTokens  tokens whose {@code getSurface()} values are compared
     */
    public static void assertTokenSurfacesEquals(List<String> expectedSurfaces,
                                                 List<? extends TokenBase> actualTokens) {
        List<String> actualSurfaces = new ArrayList<>(actualTokens.size());

        for (TokenBase token : actualTokens) {
            actualSurfaces.add(token.getSurface());
        }

        assertEquals(expectedSurfaces, actualSurfaces);
    }

    /**
     * Reads the input line by line as UTF-8 and applies
     * {@link #assertCanTokenizeString(String, TokenizerBase)} to every line.
     * <p>
     * The stream is not closed by this method; that is left to the caller.
     *
     * @param untokenizedInput  stream of text to tokenize
     * @param tokenizer  tokenizer to exercise
     * @throws IOException if reading the stream fails
     */
    public static void assertCanTokenizeStream(InputStream untokenizedInput,
                                               TokenizerBase tokenizer) throws IOException {
        BufferedReader untokenizedInputReader = new BufferedReader(
            new InputStreamReader(untokenizedInput, StandardCharsets.UTF_8)
        );

        String untokenizedLine;
        while ((untokenizedLine = untokenizedInputReader.readLine()) != null) {
            assertCanTokenizeString(untokenizedLine, tokenizer);
        }
    }

    /**
     * Asserts that tokenizing a non-empty string yields at least one token and that
     * tokenizing an empty string yields none.
     *
     * @param input  text to tokenize
     * @param tokenizer  tokenizer to exercise
     */
    public static void assertCanTokenizeString(String input, TokenizerBase tokenizer) {
        List<? extends TokenBase> tokens = tokenizer.tokenize(input);

        if (input.length() > 0) {
            assertFalse(tokens.isEmpty());
        } else {
            assertTrue(tokens.isEmpty());
        }
    }

    /**
     * Tokenizes every line of {@code untokenizedInput} and compares each resulting token
     * against the next line of {@code tokenizedInput}.
     * <p>
     * Each expected line is split on its first tab into a surface form and a feature string,
     * which are compared with the token's {@code getSurface()} and {@code getAllFeatures()}.
     * Both streams are decoded as UTF-8 and are not closed by this method.
     *
     * @param tokenizedInput  expected tokens, one per line
     * @param untokenizedInput  text to tokenize, one input per line
     * @param tokenizer  tokenizer to exercise
     * @throws IOException if reading either stream fails
     */
    public static void assertTokenizedStreamEquals(InputStream tokenizedInput,
                                                   InputStream untokenizedInput,
                                                   TokenizerBase tokenizer) throws IOException {
        BufferedReader untokenizedInputReader = new BufferedReader(
            new InputStreamReader(untokenizedInput, StandardCharsets.UTF_8)
        );
        BufferedReader tokenizedInputReader = new BufferedReader(
            new InputStreamReader(tokenizedInput, StandardCharsets.UTF_8)
        );

        String untokenizedLine;
        while ((untokenizedLine = untokenizedInputReader.readLine()) != null) {
            List<? extends TokenBase> tokens = tokenizer.tokenize(untokenizedLine);

            for (TokenBase token : tokens) {
                String tokenLine = tokenizedInputReader.readLine();
                assertNotNull("Expected token data ended before all tokens were matched", tokenLine);

                // TODO: Verify if this tab handling is correct...
                // Only the first tab separates surface from features; later tabs stay in the features.
                String[] parts = tokenLine.split("\\t", 2);
                // Report a malformed expected line as a test failure instead of an index error.
                assertEquals("Expected line has no tab separator: " + tokenLine, 2, parts.length);

                String surface = parts[0];
                String features = parts[1];

                assertEquals(surface, token.getSurface());
                assertEquals(features, token.getAllFeatures());
            }
        }
    }

    /**
     * Runs {@link #assertTokenizedStreamEquals(InputStream, InputStream, TokenizerBase)}
     * concurrently on {@code numThreads} threads, {@code perThreadRuns} times per thread,
     * all sharing the same tokenizer instance.
     * <p>
     * A failure raised on a worker thread would otherwise only terminate that thread, so
     * failures are collected and the first one recorded is rethrown on the calling thread
     * once all workers have finished. A worker stops at its first failure.
     *
     * @param numThreads  number of worker threads to start
     * @param perThreadRuns  number of comparison runs per worker thread
     * @param tokenizedInputResource  classpath resource holding the expected tokens
     * @param untokenizedInputResource  classpath resource holding the text to tokenize
     * @param tokenizer  tokenizer shared by all worker threads
     * @throws IOException if a worker failed with an I/O error
     * @throws InterruptedException if the calling thread is interrupted while joining workers
     */
    public static void assertMultiThreadedTokenizedStreamEquals(int numThreads,
                                                                final int perThreadRuns,
                                                                final String tokenizedInputResource,
                                                                final String untokenizedInputResource,
                                                                final TokenizerBase tokenizer)
        throws IOException, InterruptedException {
        // Written to by all workers, hence the synchronized wrapper.
        final List<Throwable> failures = Collections.synchronizedList(new ArrayList<Throwable>());
        List<Thread> threads = new ArrayList<>();

        for (int i = 0; i < numThreads; i++) {
            Thread thread = new Thread(new Runnable() {
                @Override
                public void run() {
                    for (int run = 0; run < perThreadRuns; run++) {
                        // Resources close in reverse declaration order, also when an assertion fails.
                        try (InputStream tokenizedInput =
                                 TestUtils.class.getResourceAsStream(tokenizedInputResource);
                             InputStream untokenizedInput =
                                 TestUtils.class.getResourceAsStream(untokenizedInputResource)) {
                            assertNotNull("Resource not found: " + tokenizedInputResource, tokenizedInput);
                            assertNotNull("Resource not found: " + untokenizedInputResource, untokenizedInput);

                            assertTokenizedStreamEquals(tokenizedInput, untokenizedInput, tokenizer);
                        } catch (Throwable t) {
                            // Thread boundary: record the failure so the calling thread can report it.
                            failures.add(t);
                            return;
                        }
                    }
                }
            });
            threads.add(thread);
            thread.start();
        }

        for (Thread thread : threads) {
            thread.join();
        }

        rethrowFirstFailure(failures);
    }

    /**
     * Rethrows the first recorded worker failure, if any, on the current thread.
     */
    private static void rethrowFirstFailure(List<Throwable> failures) throws IOException {
        if (failures.isEmpty()) {
            return;
        }

        Throwable first = failures.get(0);

        if (first instanceof Error) {
            throw (Error) first;
        }
        if (first instanceof RuntimeException) {
            throw (RuntimeException) first;
        }
        if (first instanceof IOException) {
            throw (IOException) first;
        }
        throw new AssertionError("Worker thread failed: " + first, first);
    }

    /**
     * Asserts that every token produced for {@code text} has the same number of entries in
     * its {@code getAllFeaturesArray()} result.
     * <p>
     * The assertion expects exactly one distinct length, so it also fails when no tokens
     * are produced.
     *
     * @param text  text to tokenize
     * @param tokenizer  tokenizer to exercise
     */
    public static void assertEqualTokenFeatureLengths(String text, TokenizerBase tokenizer) {
        List<? extends TokenBase> tokens = tokenizer.tokenize(text);
        Set<Integer> lengths = new HashSet<>();

        for (TokenBase token : tokens) {
            lengths.add(token.getAllFeaturesArray().length);
        }

        assertEquals(1, lengths.size());
    }
}