/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.analyzer;

import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.junit.Test;

import org.apache.cassandra.serializers.UTF8Serializer;

import static org.junit.Assert.assertEquals;

public class StandardAnalyzerTest
{
    @Test
    public void testTokenizationAscii() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/apache_license_header.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder()
                .maxTokenLength(5).build();
        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(67, tokens.size());
    }

    @Test
    public void testTokenizationLoremIpsum() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/lorem_ipsum.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(62, tokens.size());
    }

    @Test
    public void testTokenizationJaJp1() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/ja_jp_1.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        tokenizer.reset(is);
        List<ByteBuffer> tokens = new ArrayList<>();
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(210, tokens.size());
    }

    @Test
    public void testTokenizationJaJp2() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/ja_jp_2.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
                .ignoreStopTerms(true).alwaysLowerCaseTerms(true).build();
        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        tokenizer.reset(is);
        List<ByteBuffer> tokens = new ArrayList<>();
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(57, tokens.size());
    }

    @Test
    public void testTokenizationRuRu1() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/ru_ru_1.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(456, tokens.size());
    }

    @Test
    public void testTokenizationZnTw1() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/zn_tw_1.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(963, tokens.size());
    }

    @Test
    public void testTokenizationAdventuresOfHuckFinn() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
                .ignoreStopTerms(true).useLocale(Locale.ENGLISH)
                .alwaysLowerCaseTerms(true).build();
        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(37739, tokens.size());
    }

    @Test
    public void testSkipStopWordBeforeStemmingFrench() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
                .ignoreStopTerms(true).useLocale(Locale.FRENCH)
                .alwaysLowerCaseTerms(true).build();
        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        List<ByteBuffer> tokens = new ArrayList<>();
        List<String> words = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
        {
            final ByteBuffer nextToken = tokenizer.next();
            tokens.add(nextToken);
            words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate()));
        }

        assertEquals(4, tokens.size());
        assertEquals("dans", words.get(0));
        assertEquals("plui", words.get(1));
        assertEquals("chanson", words.get(2));
        assertEquals("connu", words.get(3));
    }

    @Test
    public void tokenizeDomainNamesAndUrls() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/top_visited_domains.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        tokenizer.reset(is);
        List<ByteBuffer> tokens = new ArrayList<>();
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(15, tokens.size());
    }

    @Test
    public void testReuseAndResetTokenizerInstance() throws Exception
    {
        List<ByteBuffer> bbToTokenize = new ArrayList<>();
        bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes()));

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        for (ByteBuffer bb : bbToTokenize)
        {
            tokenizer.reset(bb);
            while (tokenizer.hasNext())
                tokens.add(tokenizer.next());
        }

        assertEquals(10, tokens.size());
    }
}