/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.util.Collection; import java.util.Collections; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.Weight; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.QueryBuilder; import org.junit.After; import org.junit.Before; @LuceneTestCase.SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene3x"}) @LuceneTestCase.SuppressSysoutChecks(bugUrl = "")//Gradle interferes with this Lucene test rule public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase { final FieldType fieldType; Directory dir; MockAnalyzer indexAnalyzer; RandomIndexWriter indexWriter; IndexSearcher searcher; UnifiedHighlighter highlighter; IndexReader indexReader; @ParametersFactory public static Iterable<Object[]> parameters() { return UHTestHelper.parametersFactoryList(); } public TestUnifiedHighlighterStrictPhrases(FieldType fieldType) { this.fieldType = fieldType; } @Before public void doBefore() throws IOException { indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);//whitespace, punctuation, lowercase indexAnalyzer.setPositionIncrementGap(3);// more than default dir = newDirectory(); indexWriter = new RandomIndexWriter(random(), dir, indexAnalyzer); } @After public void doAfter() throws IOException { IOUtils.close(indexReader, indexWriter, dir); } private Document newDoc(String... bodyVals) { Document doc = new Document(); for (String bodyVal : bodyVals) { doc.add(new Field("body", bodyVal, fieldType)); } return doc; } private void initReaderSearcherHighlighter() throws IOException { indexReader = indexWriter.getReader(); searcher = newSearcher(indexReader); highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); highlighter.setHighlightPhrasesStrictly(true); } private PhraseQuery newPhraseQuery(String field, String phrase) { return (PhraseQuery) new QueryBuilder(indexAnalyzer).createPhraseQuery(field, phrase); } private PhraseQuery setSlop(PhraseQuery query, int slop) { PhraseQuery.Builder builder = new PhraseQuery.Builder(); Term[] terms = query.getTerms(); int[] positions = query.getPositions(); for (int i = 0; i < terms.length; i++) { builder.add(terms[i], positions[i]); } builder.setSlop(slop); return builder.build(); } public void testBasics() throws IOException { indexWriter.addDocument(newDoc("Yin yang, filter")); // filter out. test getTermToSpanLists reader 1-doc filter indexWriter.addDocument(newDoc("yin alone, Yin yang, yin gap yang")); initReaderSearcherHighlighter(); //query: -filter +"yin yang" BooleanQuery query = new BooleanQuery.Builder() .add(new TermQuery(new Term("body", "filter")), BooleanClause.Occur.MUST_NOT) .add(newPhraseQuery("body", "yin yang"), BooleanClause.Occur.MUST) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"yin alone, <b>Yin</b> <b>yang</b>, yin gap yang"}, snippets); } public void testWithSameTermQuery() throws IOException { indexWriter.addDocument(newDoc("Yin yang, yin gap yang")); initReaderSearcherHighlighter(); BooleanQuery query = new BooleanQuery.Builder() .add(new TermQuery(new Term("body", "yin")), BooleanClause.Occur.MUST) .add(newPhraseQuery("body", "yin yang"), BooleanClause.Occur.MUST) // add queries for other fields; we shouldn't highlight these because of that. .add(new TermQuery(new Term("title", "yang")), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>Yin</b> <b>yang</b>, <b>yin</b> gap yang"}, snippets); } public void testPhraseNotInDoc() throws IOException { indexWriter.addDocument(newDoc("Whatever yin")); // query matches this; highlight it indexWriter.addDocument(newDoc("nextdoc yin"));// query does NOT match this, only the SHOULD clause does initReaderSearcherHighlighter(); BooleanQuery query = new BooleanQuery.Builder() //MUST: .add(new TermQuery(new Term("body", "whatever")), BooleanClause.Occur.MUST) //SHOULD: (yet won't) .add(newPhraseQuery("body", "nextdoc yin"), BooleanClause.Occur.SHOULD) .add(newPhraseQuery("body", "nonexistent yin"), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>Whatever</b> yin"}, snippets); } public void testSubPhrases() throws IOException { indexWriter.addDocument(newDoc("alpha bravo charlie - charlie bravo alpha")); initReaderSearcherHighlighter(); BooleanQuery query = new BooleanQuery.Builder() .add(newPhraseQuery("body", "alpha bravo charlie"), BooleanClause.Occur.MUST) .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.MUST) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> - charlie bravo alpha"}, snippets); } public void testSynonyms() throws IOException { indexWriter.addDocument(newDoc("mother father w mom father w dad")); initReaderSearcherHighlighter(); MultiPhraseQuery query = new MultiPhraseQuery.Builder() .add(new Term[]{new Term("body", "mom"), new Term("body", "mother")}) .add(new Term[]{new Term("body", "dad"), new Term("body", "father")}) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>mother</b> <b>father</b> w <b>mom</b> <b>father</b> w dad"}, snippets); } /** * Test it does *not* highlight the same term's not next to the span-near. "charlie" in this case. * This particular example exercises "Rewrite" plus "MTQ" in the same query. */ public void testRewriteAndMtq() throws IOException { indexWriter.addDocument(newDoc("alpha bravo charlie - charlie bravo alpha")); initReaderSearcherHighlighter(); SpanNearQuery snq = new SpanNearQuery( new SpanQuery[]{ new SpanTermQuery(new Term("body", "bravo")), new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term("body", "ch")))}, // REWRITES 0, true); BooleanQuery query = new BooleanQuery.Builder() .add(snq, BooleanClause.Occur.MUST) .add(new PrefixQuery(new Term("body", "al")), BooleanClause.Occur.MUST) // MTQ .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.MUST) // add queries for other fields; we shouldn't highlight these because of that. .add(newPhraseQuery("title", "bravo alpha"), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> - charlie bravo <b>alpha</b>"}, snippets); // do again, this time with MTQ disabled. We should only find "alpha bravo". highlighter.setHandleMultiTermQuery(false);//disable but leave phrase processing enabled topDocs = searcher.search(query, 10, Sort.INDEXORDER); snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> charlie - charlie bravo alpha"}, snippets); } /** * Like {@link #testRewriteAndMtq} but no freestanding MTQ */ public void testRewrite() throws IOException { indexWriter.addDocument(newDoc("alpha bravo charlie - charlie bravo alpha")); initReaderSearcherHighlighter(); SpanNearQuery snq = new SpanNearQuery( new SpanQuery[]{ new SpanTermQuery(new Term("body", "bravo")), new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term("body", "ch")))}, // REWRITES 0, true); BooleanQuery query = new BooleanQuery.Builder() .add(snq, BooleanClause.Occur.MUST) // .add(new PrefixQuery(new Term("body", "al")), BooleanClause.Occur.MUST) // MTQ .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.MUST) // add queries for other fields; we shouldn't highlight these because of that. .add(newPhraseQuery("title", "bravo alpha"), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> - charlie bravo alpha"}, snippets); // do again, this time with MTQ disabled. We should only find "alpha bravo". highlighter.setHandleMultiTermQuery(false);//disable but leave phrase processing enabled topDocs = searcher.search(query, 10, Sort.INDEXORDER); snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> charlie - charlie bravo alpha"}, snippets); } /** * Like {@link #testRewriteAndMtq} but no rewrite. */ public void testMtq() throws IOException { indexWriter.addDocument(newDoc("alpha bravo charlie - charlie bravo alpha")); initReaderSearcherHighlighter(); SpanNearQuery snq = new SpanNearQuery( new SpanQuery[]{ new SpanTermQuery(new Term("body", "bravo")), new SpanTermQuery(new Term("body", "charlie"))}, // does NOT rewrite 0, true); BooleanQuery query = new BooleanQuery.Builder() .add(snq, BooleanClause.Occur.MUST) .add(new PrefixQuery(new Term("body", "al")), BooleanClause.Occur.MUST) // MTQ .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.MUST) // add queries for other fields; we shouldn't highlight these because of that. .add(newPhraseQuery("title", "bravo alpha"), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> - charlie bravo <b>alpha</b>"}, snippets); // do again, this time with MTQ disabled. highlighter.setHandleMultiTermQuery(false);//disable but leave phrase processing enabled topDocs = searcher.search(query, 10, Sort.INDEXORDER); snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> - charlie bravo alpha"}, snippets); } public void testMultiValued() throws IOException { indexWriter.addDocument(newDoc("one bravo three", "four bravo six")); initReaderSearcherHighlighter(); BooleanQuery query = new BooleanQuery.Builder() .add(newPhraseQuery("body", "one bravo"), BooleanClause.Occur.MUST) .add(newPhraseQuery("body", "four bravo"), BooleanClause.Occur.MUST) .add(new PrefixQuery(new Term("body", "br")), BooleanClause.Occur.MUST) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs, 2); assertArrayEquals(new String[]{"<b>one</b> <b>bravo</b> three... <b>four</b> <b>bravo</b> six"}, snippets); // now test phraseQuery won't span across values assert indexAnalyzer.getPositionIncrementGap("body") > 0; PhraseQuery phraseQuery = newPhraseQuery("body", "three four"); // 1 too little; won't span phraseQuery = setSlop(phraseQuery, indexAnalyzer.getPositionIncrementGap("body") - 1); query = new BooleanQuery.Builder() .add(new TermQuery(new Term("body", "bravo")), BooleanClause.Occur.MUST) .add(phraseQuery, BooleanClause.Occur.SHOULD) .build(); topDocs = searcher.search(query, 10); snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals("one <b>bravo</b> three... four <b>bravo</b> six", snippets[0]); // and add just enough slop to cross the values: phraseQuery = newPhraseQuery("body", "three four"); phraseQuery = setSlop(phraseQuery, indexAnalyzer.getPositionIncrementGap("body")); // just enough to span query = new BooleanQuery.Builder() .add(new TermQuery(new Term("body", "bravo")), BooleanClause.Occur.MUST) .add(phraseQuery, BooleanClause.Occur.MUST) // must match and it will .build(); topDocs = searcher.search(query, 10); assertEquals(1, topDocs.totalHits); snippets = highlighter.highlight("body", query, topDocs, 2); assertEquals("one <b>bravo</b> <b>three</b>... <b>four</b> <b>bravo</b> six", snippets[0]); } public void testMaxLen() throws IOException { indexWriter.addDocument(newDoc("alpha bravo charlie - gap alpha bravo")); // hyphen is at char 21 initReaderSearcherHighlighter(); highlighter.setMaxLength(21); BooleanQuery query = new BooleanQuery.Builder() .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.MUST) .add(newPhraseQuery("body", "gap alpha"), BooleanClause.Occur.MUST) .add(newPhraseQuery("body", "charlie gap"), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); if (fieldType == UHTestHelper.reanalysisType) { assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> charlie -"}, snippets); } else { assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> -"}, snippets); } } public void testFilteredOutSpan() throws IOException { indexWriter.addDocument(newDoc("freezing cold stuff like stuff freedom of speech")); initReaderSearcherHighlighter(); WildcardQuery wildcardQuery = new WildcardQuery(new Term("body", "free*")); SpanMultiTermQueryWrapper<WildcardQuery> wildcardSpanQuery = new SpanMultiTermQueryWrapper<>(wildcardQuery); SpanTermQuery termQuery = new SpanTermQuery(new Term("body", "speech")); SpanQuery spanQuery = new SpanNearQuery(new SpanQuery[]{wildcardSpanQuery, termQuery}, 3, false); BooleanQuery query = new BooleanQuery.Builder() .add(spanQuery, BooleanClause.Occur.MUST) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); assertArrayEquals(new String[]{"freezing cold stuff like stuff <b>freedom</b> of <b>speech</b>"}, snippets); } public void testMatchNoDocsQuery() throws IOException { highlighter = new UnifiedHighlighter(null, indexAnalyzer); highlighter.setHighlightPhrasesStrictly(true); String content = "whatever"; Object o = highlighter.highlightWithoutSearcher("body", new MatchNoDocsQuery(), content, 1); assertEquals(content, o); } public void testPreSpanQueryRewrite() throws IOException { indexWriter.addDocument(newDoc("There is no accord and satisfaction with this - Consideration of the accord is arbitrary.")); initReaderSearcherHighlighter(); highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { @Override protected Collection<Query> preSpanQueryRewrite(Query query) { if (query instanceof MyQuery) { return Collections.singletonList(((MyQuery)query).wrapped); } return null; } }; highlighter.setHighlightPhrasesStrictly(true); BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); Query phraseQuery = new BoostQuery(new PhraseQuery("body", "accord", "and", "satisfaction"), 2.0f); Query oredTerms = new BooleanQuery.Builder() .setMinimumNumberShouldMatch(2) .add(new TermQuery(new Term("body", "accord")), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("body", "satisfaction")), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("body", "consideration")), BooleanClause.Occur.SHOULD) .build(); Query proximityBoostingQuery = new MyQuery(oredTerms); Query totalQuery = bqBuilder .add(phraseQuery, BooleanClause.Occur.SHOULD) .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); assertEquals(1, topDocs.totalHits); String[] snippets = highlighter.highlight("body", totalQuery, topDocs); assertArrayEquals(new String[]{"There is no <b>accord</b> <b>and</b> <b>satisfaction</b> with this - <b>Consideration</b> of the <b>accord</b> is arbitrary."}, snippets); } private static class MyQuery extends Query { private final Query wrapped; MyQuery(Query wrapped) { this.wrapped = wrapped; } @Override public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { return wrapped.createWeight(searcher, needsScores, boost); } @Override public Query rewrite(IndexReader reader) throws IOException { Query newWrapped = wrapped.rewrite(reader); if (newWrapped != wrapped) { return new MyQuery(newWrapped); } return this; } @Override public String toString(String field) { return "[[["+wrapped.toString(field)+"]]]"; } @Override public boolean equals(Object obj) { return obj != null && obj.getClass() == getClass() && wrapped.equals(((MyQuery)wrapped).wrapped); } @Override public int hashCode() { return wrapped.hashCode(); } } }