package org.infinispan.query.analysis;
import static java.util.Arrays.asList;
import static org.testng.AssertJUnit.assertEquals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.infinispan.configuration.cache.ConfigurationBuilder;
import org.infinispan.configuration.cache.Index;
import org.infinispan.manager.EmbeddedCacheManager;
import org.infinispan.query.Search;
import org.infinispan.query.SearchManager;
import org.infinispan.test.SingleCacheManagerTest;
import org.infinispan.test.fwk.TestCacheManagerFactory;
import org.testng.annotations.Test;
/**
* Copied and adapted from Hibernate Search
* org.hibernate.search.test.analyzer.solr.SolrAnalyzerTest
*
* @author Sanne Grinovero <sanne@hibernate.org> (C) 2012 Red Hat Inc.
* @author Emmanuel Bernard
* @author Hardy Ferentschik
*/
@Test(groups = "functional", testName = "query.analysis.AnalyzerTest")
public class AnalyzerTest extends SingleCacheManagerTest {
protected EmbeddedCacheManager createCacheManager() throws Exception {
ConfigurationBuilder cfg = getDefaultStandaloneCacheConfig(true);
cfg
.indexing()
.index(Index.ALL)
.addIndexedEntity(Team.class)
.addProperty("hibernate.search.default.directory_provider", "ram")
.addProperty("lucene_version", "LUCENE_CURRENT");
return TestCacheManagerFactory.createCacheManager(cfg);
}
/**
* Tests that the token filters applied to <code>Team</code> are successfully created and used. Refer to
* <code>Team</code> to see the exact definitions.
*
* @throws Exception in case the test fails
*/
public void testAnalyzerDef() throws Exception {
// create the test instance
Team team = new Team();
team.setDescription("This is a D\u00E0scription"); // \u00E0 == � - ISOLatin1AccentFilterFactory should strip of diacritic
team.setLocation("Atlanta");
team.setName("ATL team");
// persist and index the test object
cache.put("id", team);
SearchManager searchManager = Search.getSearchManager(cache);
// execute several search to show that the right tokenizers were applies
TermQuery query = new TermQuery(new Term("description", "D\u00E0scription"));
assertEquals(
"iso latin filter should work. � should be a now", 0, searchManager.getQuery(query).list().size()
);
query = new TermQuery(new Term("description", "is"));
assertEquals(
"stop word filter should work. is should be removed", 0, searchManager.getQuery(query).list().size()
);
query = new TermQuery(new Term("description", "dascript"));
assertEquals(
"snowball stemmer should work. 'dascription' should be stemmed to 'dascript'",
1,
searchManager.getQuery(query).list().size()
);
}
/**
* Tests the analyzers defined on {@link Team}.
*
* @throws Exception in case the test fails.
*/
public void testAnalyzers() throws Exception {
SearchManager search = Search.getSearchManager(cache);
Analyzer analyzer = search.getAnalyzer("standard_analyzer");
String text = "This is just FOOBAR's";
assertEquals(asList("This", "is", "just", "FOOBAR's"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("html_standard_analyzer");
text = "This is <b>foo</b><i>bar's</i>";
assertEquals(asList("This", "is", "foobar's"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("html_whitespace_analyzer");
text = "This is <b>foo</b><i>bar's</i>";
assertEquals(asList("This", "is", "foobar's"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("length_analyzer");
text = "ab abc abcd abcde abcdef";
assertEquals(asList("abc", "abcd", "abcde"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("length_analyzer");
text = "ab abc abcd abcde abcdef";
assertEquals(asList("abc", "abcd", "abcde"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("porter_analyzer");
text = "bikes bikes biking";
assertEquals(asList("bike", "bike", "bike"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("word_analyzer");
text = "CamelCase";
assertEquals(asList("Camel", "Case"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("synonym_analyzer");
text = "ipod cosmos";
assertEquals(asList("ipod", "i-pod", "cosmos", "universe"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("shingle_analyzer");
text = "please divide this sentence into shingles";
assertEquals(asList(
"please",
"please divide",
"divide",
"divide this",
"this",
"this sentence",
"sentence",
"sentence into",
"into",
"into shingles",
"shingles"), terms(analyzer, "name", text));
analyzer = search.getAnalyzer("pattern_analyzer");
text = "foo,bar";
assertEquals(asList("foo", "bar"), terms(analyzer, "name", text));
// CharStreamFactories test
analyzer = search.getAnalyzer("mapping_char_analyzer");
text = "CORA\u00C7\u00C3O DE MEL\u00C3O";
assertEquals(asList("CORACAO", "DE", "MELAO"), terms(analyzer, "name", text));
}
private List<String> terms(Analyzer analyzer, String fieldName, String text) throws IOException {
List<String> terms = new ArrayList<>();
TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
tokenStream.addAttribute(CharTermAttribute.class);
CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
terms.add(attribute.toString());
}
tokenStream.close();
return terms;
}
protected Class<?>[] getAnnotatedClasses() {
return new Class[]{
Team.class
};
}
}