/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.source.lucene; import java.io.ByteArrayOutputStream; import java.io.StringWriter; import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.RAMDirectory; import org.carrot2.core.Document; import org.carrot2.core.ProcessingResult; import org.carrot2.core.attribute.AttributeNames; import org.carrot2.core.attribute.CommonAttributesDescriptor; import org.carrot2.core.test.QueryableDocumentSourceTestBase; import org.carrot2.util.attribute.AttributeUtils; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; /** * Tests Lucene document source. */ public class LuceneDocumentSourceTest extends QueryableDocumentSourceTestBase<LuceneDocumentSource> { private static SimpleAnalyzer analyzer; private static RAMDirectory directory; @BeforeClass public static void prepareIndex() throws Exception { directory = new RAMDirectory(); analyzer = new SimpleAnalyzer(); LuceneIndexUtils.createAndPopulateIndex(directory, analyzer); } @Before public void prepareComponent() { this.initAttributes.put( AttributeUtils.getKey(LuceneDocumentSource.class, "directory"), directory); this.initAttributes.put( AttributeUtils.getKey(SimpleFieldMapper.class, "titleField"), "title"); this.initAttributes.put( AttributeUtils.getKey(SimpleFieldMapper.class, "contentField"), "snippet"); this.initAttributes.put( AttributeUtils.getKey(SimpleFieldMapper.class, "urlField"), "url"); this.initAttributes.put( AttributeUtils.getKey(SimpleFieldMapper.class, "searchFields"), Arrays.asList(new String [] { "title", "snippet" })); } @Override public Class<LuceneDocumentSource> getComponentClass() { return LuceneDocumentSource.class; } @Override protected boolean hasUtfResults() { return false; } @Override protected String getSmallQueryText() { return "software"; } @Override protected int getSmallQuerySize() { return 13; } @Override protected String getLargeQueryText() { return "data mining"; } @Override protected int getLargeQuerySize() { return 100; } @Test public void testCustomFormatter() throws Exception { this.initAttributes.put( AttributeUtils.getKey(SimpleFieldMapper.class, "formatter"), SimpleHTMLFormatter.class); runQuery(getLargeQueryText(), getLargeQuerySize()); int highlights = 0; for (Document d : getDocuments()) { if (((String) d.getField(Document.SUMMARY)).indexOf("") >= 0) { highlights++; } } assertThat(highlights).as("Number of highlights").isGreaterThan(10); } @Test public void testCustomQuery() throws Exception { final BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.add(new TermQuery(new Term("snippet", "data")), Occur.MUST); this.processingAttributes.put(AttributeNames.QUERY, builder.build()); assertThat(runQuery(null, getLargeQuerySize())).as("Number of results") .isGreaterThan(10); } @Test public void testAdvancedQueries() throws Exception { assertThat(runQuery("\"data mining\"", getLargeQuerySize())).as( "Number of results").isEqualTo(99); } @Test public void testMultiEntryField() throws Exception { runQuery("\"termb\"", getLargeQuerySize()); final List<Document> list = getDocuments(); assertThat(list.size()).isEqualTo(1); assertThat(list.get(0).getSummary()).contains("terma"); assertThat(list.get(0).getSummary()).contains("termb"); } /** * Test case for CARROT-820. */ @Test public void testCatchAllQueryWithHighlighting() throws Exception { SimpleFieldMapperDescriptor.attributeBuilder(processingAttributes).formatter( PlainTextFormatter.class); runQuery("*:*", 2); final List<Document> list = getDocuments(); assertThat(list.size()).isEqualTo(2); assertThat(list.get(0).getSummary()).isNotEmpty(); assertThat(list.get(0).getSummary()).isNotEmpty(); } @Test public void luceneScorePassing() throws Exception { final int results = 10; assertThat(runQuery("\"data mining\"", results)).as("Number of results") .isEqualTo(results); for (Document document : getDocuments()) { assertThat(document.getScore()).isNotNull().isGreaterThan(0); } } /** * Keeping Lucene documents by default is not a good idea, because it would cause the * cache size to grow very quickly. */ @Test public void luceneDocumentNotPassedByDefault() throws Exception { final int results = 10; assertThat(runQuery("\"data mining\"", results)).as("Number of results") .isEqualTo(results); for (Document document : getDocuments()) { for (Object field : document.getFields().values()) { // Lucene Document class is final assertThat(field.getClass()).as("Field type").isNotEqualTo( org.apache.lucene.document.Document.class); } } } @Test public void luceneDocumentPassing() throws Exception { LuceneDocumentSourceDescriptor.attributeBuilder(processingAttributes) .keepLuceneDocuments(true); final int results = 10; assertThat(runQuery("\"data mining\"", results)).as("Number of results") .isEqualTo(results); for (Document document : getDocuments()) { assertThat((Object) document.getField(LuceneDocumentSource.LUCENE_DOCUMENT_FIELD)) .isInstanceOf(org.apache.lucene.document.Document.class); } } @Test public void luceneDocumentNotSerialized() throws Exception { final int results = 2; CommonAttributesDescriptor.attributeBuilder(processingAttributes) .query("\"data mining\"").results(results); LuceneDocumentSourceDescriptor.attributeBuilder(processingAttributes) .keepLuceneDocuments(true); final ProcessingResult result = getSimpleController(initAttributes).process( processingAttributes, LuceneDocumentSource.class); assertThat(result.getDocuments().size()).as("Number of results").isEqualTo( results); final StringWriter json = new StringWriter(); result.serializeJson(json); assertThat(json.toString()).doesNotContain("\"luceneDocument\""); final ByteArrayOutputStream xml = new ByteArrayOutputStream(); result.serialize(xml); assertThat(xml.toString("UTF-8")).doesNotContain( "org.apache.lucene.document.Document"); } }