/*
* Hibernate Search, full-text search for your domain model
*
* License: GNU Lesser General Public License (LGPL), version 2.1 or later
* See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
*/
package org.hibernate.search.test.analyzer.definition;
import static org.hibernate.search.test.analyzer.AnalyzerTest.assertTokensEqual;
import static org.junit.Assert.assertEquals;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.hibernate.Transaction;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.hibernate.search.test.SearchTestBase;
import org.hibernate.search.testsupport.junit.SkipOnElasticsearch;
import org.hibernate.search.util.AnalyzerUtils;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
* Tests the analyzer creation framework.
* This test might be affected by the version of the Analyzers being used.
*
* @author Emmanuel Bernard
* @author Hardy Ferentschik
*/
public class AnalyzerBuilderTest extends SearchTestBase {

	/**
	 * Tests that the token filters applied to <code>Team</code> are successfully created and used. Refer to
	 * <code>Team</code> to see the exact definitions.
	 *
	 * @throws Exception in case the test fails
	 */
	@Test
	public void testAnalyzerDef() throws Exception {
		// create the test instance
		Team team = new Team();
		team.setDescription( "This is a D\u00E0scription" ); // \u00E0 == à - ISOLatin1AccentFilterFactory should strip off the diacritic
		team.setLocation( "Atlanta" );
		team.setName( "ATL team" );

		// persist and index the test object
		FullTextSession fts = Search.getFullTextSession( openSession() );
		Transaction tx = fts.beginTransaction();
		fts.persist( team );
		tx.commit();
		fts.clear();

		// execute several searches to show that the right tokenizers were applied
		tx = fts.beginTransaction();

		// the accent filter should have normalized '\u00E0' to 'a' at indexing time,
		// so the accented term must not match anything
		TermQuery query = new TermQuery( new Term( "description", "D\u00E0scription" ) );
		assertEquals(
				"iso latin filter should work. \u00E0 should be a now", 0, fts.createFullTextQuery( query ).list().size()
		);

		// "is" is a stop word and should have been dropped from the index
		query = new TermQuery( new Term( "description", "is" ) );
		assertEquals(
				"stop word filter should work. is should be removed", 0, fts.createFullTextQuery( query ).list().size()
		);

		// only the stemmed form is indexed, so it must match exactly one document
		query = new TermQuery( new Term( "description", "dascript" ) );
		assertEquals(
				"snowball stemmer should work. 'dascription' should be stemmed to 'dascript'",
				1,
				fts.createFullTextQuery( query ).list().size()
		);

		// cleanup
		fts.delete( fts.createFullTextQuery( query ).list().get( 0 ) );
		tx.commit();
		fts.close();
	}

	/**
	 * Tests the analyzers defined on {@link Team} by retrieving each one from the search
	 * factory and checking the token stream it produces for a sample input.
	 *
	 * @throws Exception in case the test fails.
	 */
	@Test
	@Category(SkipOnElasticsearch.class) // Analyzers cannot be retrieved directly when using Elasticsearch
	public void testAnalyzers() throws Exception {
		FullTextSession fts = Search.getFullTextSession( openSession() );

		// plain StandardTokenizer-based analyzer: splits on whitespace/punctuation, keeps case
		Analyzer analyzer = fts.getSearchFactory().getAnalyzer( "standard_analyzer" );
		String text = "This is just FOOBAR's";
		Token[] tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "This", "is", "just", "FOOBAR's" } );

		// HTML char filter strips markup before standard tokenization
		analyzer = fts.getSearchFactory().getAnalyzer( "html_standard_analyzer" );
		text = "This is <b>foo</b><i>bar's</i>";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "This", "is", "foobar's" } );

		// HTML char filter combined with whitespace tokenization
		analyzer = fts.getSearchFactory().getAnalyzer( "html_whitespace_analyzer" );
		text = "This is <b>foo</b><i>bar's</i>";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "This", "is", "foobar's" } );

		// trim filter removes leading/trailing whitespace from tokens
		analyzer = fts.getSearchFactory().getAnalyzer( "trim_analyzer" );
		text = " Kittens! ";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "Kittens!" } );

		// length filter keeps only tokens within the configured min/max length (3 to 5 here)
		analyzer = fts.getSearchFactory().getAnalyzer( "length_analyzer" );
		text = "ab abc abcd abcde abcdef";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "abc", "abcd", "abcde" } );

		// Porter stemmer reduces inflected forms to a common stem
		analyzer = fts.getSearchFactory().getAnalyzer( "porter_analyzer" );
		text = "bikes bikes biking";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "bike", "bike", "bike" } );

		// word delimiter filter splits on intra-word case transitions
		analyzer = fts.getSearchFactory().getAnalyzer( "word_analyzer" );
		text = "CamelCase";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "Camel", "Case" } );

		// synonym filter maps "cosmos" to "universe"
		analyzer = fts.getSearchFactory().getAnalyzer( "synonym_analyzer" );
		text = "ipod universe cosmos";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "ipod", "universe", "universe" } );

		// shingle filter emits word n-grams alongside the single tokens
		analyzer = fts.getSearchFactory().getAnalyzer( "shingle_analyzer" );
		text = "please divide this sentence into shingles";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual(
				tokens,
				new String[] {
						"please",
						"please divide",
						"divide",
						"divide this",
						"this",
						"this sentence",
						"sentence",
						"sentence into",
						"into",
						"into shingles",
						"shingles"
				}
		);

		// phonetic filter replaces tokens with their phonetic encoding
		analyzer = fts.getSearchFactory().getAnalyzer( "phonetic_analyzer" );
		text = "The quick brown fox jumped over the lazy dogs";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		AnalyzerUtils.displayTokens( analyzer, "name", text ); // diagnostic output for debugging failures
		assertTokensEqual(
				tokens, new String[] { "0", "KK", "BRN", "FKS", "JMPT", "OFR", "0", "LS", "TKS" }
		);

		// pattern tokenizer splits on the configured regex (a comma here)
		analyzer = fts.getSearchFactory().getAnalyzer( "pattern_analyzer" );
		text = "foo,bar";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "foo", "bar" } );

		// CharStreamFactories test: mapping char filter replaces accented characters before tokenization
		analyzer = fts.getSearchFactory().getAnalyzer( "mapping_char_analyzer" );
		text = "CORA\u00C7\u00C3O DE MEL\u00C3O";
		tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
		assertTokensEqual( tokens, new String[] { "CORACAO", "DE", "MELAO" } );

		fts.close();
	}

	@Override
	public Class<?>[] getAnnotatedClasses() {
		return new Class<?>[] {
				Team.class
		};
	}

	@Override
	public void configure(Map<String,Object> cfg) {
		super.configure( cfg );
		// pin the analyzer behavior to the latest Lucene version so the expected
		// token streams above match what the factories actually produce
		cfg.put( "hibernate.search.lucene_version", org.apache.lucene.util.Version.LATEST.toString() );
	}
}