/*-*
* Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. A copy of the
* License is distributed with this work in the LICENSE.md file. You may
* also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.ipadic;
import com.atilika.kuromoji.TokenizerBase.Mode;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.assertEquals;
/**
 * Checks the segmentation produced by a {@link Tokenizer} built in {@link Mode#SEARCH} mode
 * against expectations read from a classpath resource.
 */
public class SearchTokenizerTest {

    /** Tokenizer shared by all tests in this class; built once in {@link #beforeClass()}. */
    private static Tokenizer tokenizer;

    /**
     * Builds the shared search-mode tokenizer before any test runs.
     *
     * @throws Exception if the tokenizer cannot be built
     */
    @BeforeClass
    public static void beforeClass() throws Exception {
        tokenizer = new Tokenizer.Builder().mode(Mode.SEARCH).build();
    }

    /**
     * Runs every case listed in {@code /search-segmentation-tests.txt}.
     *
     * @throws IOException if the test resource is missing or cannot be read
     */
    @Test
    public void testCompoundSplitting() throws IOException {
        assertSegmentation("/search-segmentation-tests.txt");
    }

    /**
     * Reads a UTF-8 test resource and asserts the segmentation of every case it contains.
     *
     * <p>Each data line holds the input text, a tab, and the expected token surfaces separated
     * by whitespace. Everything from a {@code #} to the end of the line is discarded as a
     * comment, and lines that are blank after comment removal are skipped.
     *
     * @param testFilename classpath resource name, resolved relative to this class
     * @throws IOException if the resource cannot be found or read
     * @throws AssertionError if a data line has no tab separator, or a segmentation differs
     */
    public void assertSegmentation(String testFilename) throws IOException {
        InputStream stream = getResourceAsStream(testFilename);
        if (stream == null) {
            // Class.getResourceAsStream signals a missing resource with null; report it
            // explicitly instead of letting InputStreamReader fail with a bare NPE.
            throw new IOException("Test resource not found: " + testFilename);
        }

        // try-with-resources closes the reader (and the wrapped stream) even when an
        // assertion fails part-way through the file.
        try (LineNumberReader reader = new LineNumberReader(
                new InputStreamReader(stream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Remove comments
                line = line.replaceAll("#.*$", "");

                // Skip empty lines or comment lines
                if (line.trim().isEmpty()) {
                    continue;
                }

                String[] fields = line.split("\t", 2);
                if (fields.length < 2) {
                    throw new AssertionError("Malformed test case at " + testFilename + ":"
                        + reader.getLineNumber() + " (expected <text>TAB<surfaces>): " + line);
                }

                String text = fields[0];
                // Trim first: leading whitespace would otherwise make split() yield an
                // empty first element that no token surface could ever match.
                List<String> expectedSurfaces = Arrays.asList(fields[1].trim().split("\\s+"));

                assertSegmentation(text, expectedSurfaces);
            }
        }
    }

    /**
     * Tokenizes {@code text} with the shared tokenizer and asserts that the token surfaces
     * equal {@code expectedSurfaces}, in order.
     *
     * @param text input to tokenize
     * @param expectedSurfaces expected token surfaces, in order
     */
    public void assertSegmentation(String text, List<String> expectedSurfaces) {
        List<Token> tokens = tokenizer.tokenize(text);

        assertEquals("Input: " + text, expectedSurfaces.size(), tokens.size());

        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(
                "Input: " + text + ", token index: " + i,
                expectedSurfaces.get(i),
                tokens.get(i).getSurface()
            );
        }
    }

    /**
     * Opens a classpath resource relative to this test class.
     *
     * @param resource resource name
     * @return the resource stream, or {@code null} if the resource does not exist
     */
    private InputStream getResourceAsStream(String resource) {
        return this.getClass().getResourceAsStream(resource);
    }
}