/*
* Copyright Robert Newson
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.rnewson.couchdb.lucene.util;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.json.JSONObject;
import org.junit.Test;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.containsString;
import static org.junit.Assert.assertThat;
/**
 * Tests for {@link Analyzers}: checks which {@link Analyzer} implementation is
 * produced for a given analyzer name or JSON specification, and, for a few
 * analyzers, which tokens they emit for sample text.
 */
public class AnalyzersTest {

    /** The name "standard" must resolve to a {@link StandardAnalyzer}. */
    @Test
    public void testStandard() throws Exception {
        assertThat(Analyzers.getAnalyzer("standard"), is(StandardAnalyzer.class));
    }

    /** The name "french" must resolve to a {@link FrenchAnalyzer}. */
    @Test
    public void testFrench() throws Exception {
        assertThat(Analyzers.getAnalyzer("french"), is(FrenchAnalyzer.class));
    }

    /** The name "whitespace" must resolve to a {@link WhitespaceAnalyzer}. */
    @Test
    public void testWhitespace() throws Exception {
        assertThat(Analyzers.getAnalyzer("whitespace"), is(WhitespaceAnalyzer.class));
    }

    /**
     * A "perfield:" specification without an explicit default must yield a
     * {@link PerFieldAnalyzerWrapper} whose string form lists a StandardAnalyzer
     * default plus the per-field analyzers that were requested.
     */
    @Test
    public void testPerField() throws Exception {
        final Analyzer analyzer = Analyzers.getAnalyzer("perfield:{name:\"standard\",age:\"keyword\"}");
        assertThat(analyzer, is(PerFieldAnalyzerWrapper.class));
        final String description = analyzer.toString();
        assertThat(description, containsString("default=org.apache.lucene.analysis.standard.StandardAnalyzer"));
        assertThat(description, containsString("name=org.apache.lucene.analysis.standard.StandardAnalyzer"));
        assertThat(description, containsString("age=org.apache.lucene.analysis.core.KeywordAnalyzer"));
    }

    /** A "perfield:" specification may override the default analyzer via the "default" key. */
    @Test
    public void testPerFieldDefault() throws Exception {
        final Analyzer analyzer = Analyzers.getAnalyzer("perfield:{default:\"keyword\"}");
        assertThat(analyzer, is(PerFieldAnalyzerWrapper.class));
        assertThat(analyzer.toString(), containsString("default=org.apache.lucene.analysis.core.KeywordAnalyzer"));
    }

    /** The bare name "ngram" must resolve to an analyzer whose string form mentions NGramAnalyzer. */
    @Test
    public void testNGramInstance() throws Exception {
        final Analyzer analyzer = Analyzers.getAnalyzer("ngram");
        assertThat(analyzer.toString(), containsString("NGramAnalyzer"));
    }

    /** A JSON object with only a "class" entry must instantiate that analyzer class. */
    @Test
    public void testClassInstance() throws Exception {
        final JSONObject obj = new JSONObject("{ \"class\": \"org.apache.lucene.analysis.core.KeywordAnalyzer\" }");
        final Analyzer analyzer = Analyzers.getAnalyzer(obj);
        assertThat(analyzer, is(KeywordAnalyzer.class));
    }

    /** A "class" entry accompanied by an empty "params" array must still instantiate the class. */
    @Test
    public void testClassInstance2() throws Exception {
        final JSONObject obj = new JSONObject("{ \"class\": \"org.apache.lucene.analysis.nl.DutchAnalyzer\", \"params\": [] }");
        final Analyzer analyzer = Analyzers.getAnalyzer(obj);
        assertThat(analyzer, is(org.apache.lucene.analysis.nl.DutchAnalyzer.class));
    }

    /** A "class" entry with a typed boolean entry in "params" must instantiate the class. */
    @Test
    public void testClassInstance3() throws Exception {
        final JSONObject obj =
                new JSONObject("{ \"class\": \"org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer\", \"params\": [ { \"name\": \"useDefaultStopWords\", \"type\": \"boolean\", \"value\": true } ] }");
        final Analyzer analyzer = Analyzers.getAnalyzer(obj);
        assertThat(analyzer, is(org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer.class));
    }

    /** A JSON object keyed by an analyzer name with an empty object value must resolve by name. */
    @Test
    public void testClassInstance4() throws Exception {
        final JSONObject obj = new JSONObject("{ \"german\": {} }");
        final Analyzer analyzer = Analyzers.getAnalyzer(obj);
        assertThat(analyzer, is(org.apache.lucene.analysis.de.GermanAnalyzer.class));
    }

    /** A JSON object keyed by an analyzer name with an empty string value must resolve by name. */
    @Test
    public void testClassInstance5() throws Exception {
        final JSONObject obj = new JSONObject("{ \"cjk\": \"\" }");
        final Analyzer analyzer = Analyzers.getAnalyzer(obj);
        assertThat(analyzer, is(org.apache.lucene.analysis.cjk.CJKAnalyzer.class));
    }

    /** The JSON object form of "ngram" accepts "analyzer", "min" and "max" settings. */
    @Test
    public void testClassInstance6() throws Exception {
        final JSONObject obj = new JSONObject("{ \"ngram\": { \"analyzer\": \"simple\", \"min\": 2, \"max\": 3 } }");
        final Analyzer analyzer = Analyzers.getAnalyzer(obj);
        assertThat(analyzer.toString(), containsString("NGramAnalyzer"));
    }

    /** A "perfield:" string specification may mix analyzer names with nested "class" objects. */
    @Test
    public void testClassInstance7() throws Exception {
        final Analyzer analyzer = Analyzers.getAnalyzer("perfield:{default:\"keyword\", lang_bo:{\"class\":\"org.apache.lucene.analysis.core.WhitespaceAnalyzer\"}, lang_sa:{\"class\":\"org.apache.lucene.analysis.hi.HindiAnalyzer\"}}");
        assertThat(analyzer, is(PerFieldAnalyzerWrapper.class));
        final String description = analyzer.toString();
        assertThat(description, containsString("default=org.apache.lucene.analysis.core.KeywordAnalyzer"));
        assertThat(description, containsString("lang_bo=org.apache.lucene.analysis.core.WhitespaceAnalyzer"));
        assertThat(description, containsString("lang_sa=org.apache.lucene.analysis.hi.HindiAnalyzer"));
    }

    /** {@code Analyzers.fromSpec} must accept the same per-field configuration as a pure JSON string. */
    @Test
    public void testClassInstance8() throws Exception {
        final Analyzer analyzer = Analyzers.fromSpec("{\"perfield\":{\"default\": \"keyword\",\"lang_bo\": {\"class\": \"org.apache.lucene.analysis.core.WhitespaceAnalyzer\"},\"lang_sa\": {\"class\": \"org.apache.lucene.analysis.hi.HindiAnalyzer\"}}}");
        assertThat(analyzer, is(PerFieldAnalyzerWrapper.class));
        final String description = analyzer.toString();
        assertThat(description, containsString("default=org.apache.lucene.analysis.core.KeywordAnalyzer"));
        assertThat(description, containsString("lang_bo=org.apache.lucene.analysis.core.WhitespaceAnalyzer"));
        assertThat(description, containsString("lang_sa=org.apache.lucene.analysis.hi.HindiAnalyzer"));
    }

    /** With no min/max given, the n-gram analyzer is expected to emit the 1- and 2-grams listed below. */
    @Test
    public void testNGramTokens() throws Exception {
        assertThat(analyze("ngram:{\"analyzer\":\"simple\"}", "hey there"), is(new String[]{"h", "he", "e", "ey", "y", "t", "th", "h", "he", "e", "er", "r", "re", "e"}));
    }

    /** With min=2 and max=3, the n-gram analyzer is expected to emit the 2- and 3-grams listed below. */
    @Test
    public void testNGramMinMax() throws Exception {
        assertThat(analyze("ngram:{\"analyzer\":\"simple\",\"min\":2,\"max\":3}", "hello there"), is(new String[]{"he", "hel", "el", "ell", "ll", "llo", "lo", "th", "the", "he", "her", "er", "ere", "re"}));
    }

    /** "standard" is expected to split an e-mail address at '@'; "classic" is expected to keep it whole. */
    @Test
    public void testEmailAddresses() throws Exception {
        assertThat(analyze("standard", "foo@bar.com"), is(new String[]{"foo", "bar.com"}));
        assertThat(analyze("classic", "foo@bar.com"), is(new String[]{"foo@bar.com"}));
    }

    /**
     * Runs {@code text} through the analyzer obtained from
     * {@link Analyzers#getAnalyzer(String)} and collects the emitted terms.
     *
     * @param analyzerName analyzer name or specification passed to {@code Analyzers.getAnalyzer}
     * @param text         the text to tokenize (analyzed under the field name "default")
     * @return the term text of every token, in emission order
     * @throws Exception if the analyzer cannot be created or tokenization fails
     */
    private String[] analyze(final String analyzerName, final String text) throws Exception {
        // NOTE(review): the analyzer itself is not closed here because it is not
        // visible from this file whether Analyzers hands out shared instances.
        final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
        final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
        final List<String> result = new ArrayList<String>();
        try {
            // Obtain the attribute once, before consuming; the same instance is
            // updated in place on every incrementToken() call.
            final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            // Complete the consumer workflow: end() after the last token.
            stream.end();
        } finally {
            // Always release the stream so its resources are not leaked.
            stream.close();
        }
        return result.toArray(new String[0]);
    }
}