/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search; import java.io.IOException; import java.util.LinkedList; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.Token; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.junit.Ignore; /** * This class tests the MultiPhraseQuery class. 
*
* <p>Covers phrase-prefix expansion, exact and sloppy matching, terms indexed with a zero
* position increment, use inside a {@link BooleanQuery}, {@code equals}/{@code hashCode}
* and argument validation of {@link MultiPhraseQuery.Builder}.
*/
public class TestMultiPhraseQuery extends LuceneTestCase {

  /**
   * Expands the prefixes "pi" and "blue" by hand through a {@link TermsEnum} and checks hit
   * counts and {@code toString()} of the resulting queries, with and without slop. Also checks
   * that mixing fields in one builder is rejected.
   */
  public void testPhrasePrefix() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("blueberry pie", writer);
    add("blueberry strudel", writer);
    add("blueberry pizza", writer);
    add("blueberry chewing gum", writer);
    add("bluebird pizza", writer);
    add("bluebird foobar pizza", writer);
    add("piccadilly circus", writer);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    // search for "blueberry pi*":
    MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
    // search for "strawberry pi*":
    MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
    query1builder.add(new Term("body", "blueberry"));
    query2builder.add(new Term("body", "strawberry"));

    // this TermsEnum gives "piccadilly", "pie" and "pizza".
    TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
    Term[] piTerms = collectTermsWithPrefix(te, "body", "pi");

    query1builder.add(piTerms);
    MultiPhraseQuery query1 = query1builder.build();
    assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());

    query2builder.add(piTerms);
    MultiPhraseQuery query2 = query2builder.build();
    assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());

    ScoreDoc[] result;
    result = searcher.search(query1, 1000).scoreDocs;
    assertEquals(2, result.length);
    result = searcher.search(query2, 1000).scoreDocs;
    assertEquals(0, result.length);

    // search for "blue* pizza":
    MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
    query3builder.add(collectTermsWithPrefix(te, "body", "blue"));
    query3builder.add(new Term("body", "pizza"));

    MultiPhraseQuery query3 = query3builder.build();

    result = searcher.search(query3, 1000).scoreDocs;
    assertEquals(2, result.length); // blueberry pizza, bluebird pizza
    assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());

    // test slop:
    query3builder.setSlop(1);
    query3 = query3builder.build();
    result = searcher.search(query3, 1000).scoreDocs;

    // just make sure no exc:
    searcher.explain(query3, 0);

    assertEquals(3, result.length); // blueberry pizza, bluebird pizza, bluebird foobar pizza

    // terms from different fields must be rejected by the builder
    MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
    expectThrows(IllegalArgumentException.class, () -> {
      query4builder.add(new Term("field1", "foo"));
      query4builder.add(new Term("field2", "foobar"));
    });

    writer.close();
    reader.close();
    indexStore.close();
  }

  /**
   * Collects, in enumeration order, every term of {@code te} that starts with {@code prefix}.
   *
   * <p>The enum is positioned with {@code seekCeil}; iteration stops at the first term that does
   * not carry the prefix (matching terms are contiguous in the sorted enumeration) or when the
   * enum is exhausted.
   *
   * @param te terms enum to seek and iterate; it is repositioned by this call
   * @param field field name used for the returned {@link Term}s
   * @param prefix prefix each returned term text must start with
   * @return the matching terms, possibly empty
   */
  private static Term[] collectTermsWithPrefix(TermsEnum te, String field, String prefix)
      throws IOException {
    LinkedList<Term> matches = new LinkedList<>();
    // term() must not be called when the seek ran off the end of the enum
    if (te.seekCeil(new BytesRef(prefix)) != TermsEnum.SeekStatus.END) {
      do {
        String text = te.term().utf8ToString();
        if (!text.startsWith(prefix)) {
          break;
        }
        matches.add(new Term(field, text));
      } while (te.next() != null);
    }
    return matches.toArray(new Term[0]);
  }

  /** LUCENE-2580: three-position query whose last position has two alternatives. */
  public void testTall() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("blueberry chocolate pie", writer);
    add("blueberry chocolate tart", writer);
    IndexReader r = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(r);
    MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
    qb.add(new Term("body", "blueberry"));
    qb.add(new Term("body", "chocolate"));
    qb.add(new Term[] {new Term("body", "pie"), new Term("body", "tart")});
    assertEquals(2, searcher.search(qb.build(), 1).totalHits);
    r.close();
    indexStore.close();
  }

  /** Sloppy query in which the same term appears at two query positions. */
  @Ignore // LUCENE-3821 fixes sloppy phrase scoring, except for this known problem
  public void testMultiSloppyWithRepeats() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("a b c d e f g h i k", writer);
    IndexReader r = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(r);

    MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
    // this will fail, when the scorer would propagate [a] rather than [a,b],
    qb.add(new Term[] {new Term("body", "a"), new Term("body", "b")});
    qb.add(new Term[] {new Term("body", "a")});
    qb.setSlop(6);
    assertEquals(1, searcher.search(qb.build(), 1).totalHits); // should match on "a b"

    r.close();
    indexStore.close();
  }

  /** Exact (slop 0) query in which the same term appears at two query positions. */
  public void testMultiExactWithRepeats() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("a b c d e f g h i k", writer);
    IndexReader r = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(r);
    MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
    qb.add(new Term[] {new Term("body", "a"), new Term("body", "d")}, 0);
    qb.add(new Term[] {new Term("body", "a"), new Term("body", "f")}, 2);
    // "d" and "f" are two positions apart in the document, so the match is on "d e f"
    assertEquals(1, searcher.search(qb.build(), 1).totalHits);
    r.close();
    indexStore.close();
  }

  /** Adds a document whose stored text field "body" holds {@code s}. */
  private void add(String s, RandomIndexWriter writer) throws IOException {
    Document doc = new Document();
    doc.add(newTextField("body", s, Field.Store.YES));
    writer.addDocument(doc);
  }

  public void testBooleanQueryContainingSingleTermPrefixQuery() throws IOException {
    // this tests against bug 33161 (now fixed)
    // In order to cause the bug, the outer query must have more than one term
    // and all terms required.
    // The contained PhraseMultiQuery must contain exactly one term array.
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("blueberry pie", writer);
    add("blueberry chewing gum", writer);
    add("blue raspberry pie", writer);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    // This query will be equivalent to +body:pie +body:"blue*"
    BooleanQuery.Builder q = new BooleanQuery.Builder();
    q.add(new TermQuery(new Term("body", "pie")), BooleanClause.Occur.MUST);

    MultiPhraseQuery.Builder troubleBuilder = new MultiPhraseQuery.Builder();
    troubleBuilder.add(new Term[] {new Term("body", "blueberry"), new Term("body", "blue")});
    q.add(troubleBuilder.build(), BooleanClause.Occur.MUST);

    // exception will be thrown here without fix
    ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;

    assertEquals("Wrong number of hits", 2, hits.length);

    // just make sure no exc:
    searcher.explain(q.build(), 0);

    writer.close();
    reader.close();
    indexStore.close();
  }

  /** A required term clause combined with a required two-position multi-phrase clause. */
  public void testPhrasePrefixWithBooleanQuery() throws IOException {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("This is a test", "object", writer);
    add("a note", "note", writer);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    // This query will be equivalent to +type:note +body:"a t*"
    BooleanQuery.Builder q = new BooleanQuery.Builder();
    q.add(new TermQuery(new Term("type", "note")), BooleanClause.Occur.MUST);

    MultiPhraseQuery.Builder troubleBuilder = new MultiPhraseQuery.Builder();
    troubleBuilder.add(new Term("body", "a"));
    troubleBuilder.add(new Term[] {new Term("body", "test"), new Term("body", "this")});
    q.add(troubleBuilder.build(), BooleanClause.Occur.MUST);

    // exception will be thrown here without fix for #35626:
    ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
    assertEquals("Wrong number of hits", 0, hits.length);
    writer.close();
    reader.close();
    indexStore.close();
  }

  /** Searching and explaining must not fail when a position's terms are absent from the index. */
  public void testNoDocs() throws Exception {
    Directory indexStore = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
    add("a note", "note", writer);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
    qb.add(new Term("body", "a"));
    qb.add(new Term[] {new Term("body", "nope"), new Term("body", "nope")});
    MultiPhraseQuery q = qb.build();
    assertEquals("Wrong number of hits", 0, searcher.search(q, 1).totalHits);

    // just make sure no exc:
    searcher.explain(q, 0);

    writer.close();
    reader.close();
    indexStore.close();
  }

  /** Queries built from equal term sequences must be equal and share a hash code. */
  public void testHashCodeAndEquals() {
    MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
    MultiPhraseQuery query1 = query1builder.build();

    MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
    MultiPhraseQuery query2 = query2builder.build();

    assertEquals(query1.hashCode(), query2.hashCode());
    assertEquals(query1, query2);

    Term term1 = new Term("someField", "someText");

    query1builder.add(term1);
    query1 = query1builder.build();

    query2builder.add(term1);
    query2 = query2builder.build();

    assertEquals(query1.hashCode(), query2.hashCode());
    assertEquals(query1, query2);

    Term term2 = new Term("someField", "someMoreText");

    query1builder.add(term2);
    query1 = query1builder.build();

    assertFalse(query1.hashCode() == query2.hashCode());
    assertFalse(query1.equals(query2));

    query2builder.add(term2);
    query2 = query2builder.build();

    assertEquals(query1.hashCode(), query2.hashCode());
    assertEquals(query1, query2);
  }

  /**
   * Adds a document with a stored text field "body" holding {@code s} and an unstored string
   * field "type" holding {@code type}.
   */
  private void add(String s, String type, RandomIndexWriter writer) throws IOException {
    Document doc = new Document();
    doc.add(newTextField("body", s, Field.Store.YES));
    doc.add(newStringField("type", type, Field.Store.NO));
    writer.addDocument(doc);
  }

  /** LUCENE-2526: {@code toString()} on an empty query must not throw. */
  public void testEmptyToString() {
    new MultiPhraseQuery.Builder().build().toString();
  }

  /** Three tokens stacked at one position, queried with every term array at position 0. */
  public void testZeroPosIncr() throws IOException {
    Directory dir = new RAMDirectory();
    final Token[] tokens = new Token[3];
    tokens[0] = new Token();
    tokens[0].append("a");
    tokens[0].setPositionIncrement(1);
    tokens[1] = new Token();
    tokens[1].append("b");
    tokens[1].setPositionIncrement(0);
    tokens[2] = new Token();
    tokens[2].append("c");
    tokens[2].setPositionIncrement(0);

    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(new TextField("field", new CannedTokenStream(tokens)));
    writer.addDocument(doc);
    doc = new Document();
    doc.add(new TextField("field", new CannedTokenStream(tokens)));
    writer.addDocument(doc);
    IndexReader r = writer.getReader();
    writer.close();
    IndexSearcher s = newSearcher(r);
    MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();

    // NOTE: not great that if we do the else clause here we
    // get different scores! MultiPhraseQuery counts that
    // phrase as occurring twice per doc (it should be 1, I
    // think?). This is because MultipleTermPositions is able to
    // return the same position more than once (0, in this
    // case):
    if (true) {
      mpqb.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
      mpqb.add(new Term[] {new Term("field", "a")}, 0);
    } else {
      mpqb.add(new Term[] {new Term("field", "a")}, 0);
      mpqb.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
    }
    TopDocs hits = s.search(mpqb.build(), 2);
    assertEquals(2, hits.totalHits);
    assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
    r.close();
    dir.close();
  }

  /** Creates a token with the given text and position increment. */
  private static Token makeToken(String text, int posIncr) {
    final Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    return t;
  }

  /** Token stream of the single document indexed by {@link #doTestZeroPosIncrSloppy}. */
  private static final Token[] INCR_0_DOC_TOKENS = new Token[] {
    makeToken("x", 1),
    makeToken("a", 1),
    makeToken("1", 0),
    makeToken("m", 1), // not existing, relying on slop=2
    makeToken("b", 1),
    makeToken("1", 0),
    makeToken("n", 1), // not existing, relying on slop=2
    makeToken("c", 1),
    makeToken("y", 1)
  };

  /** Query tokens added one term per position (each term required at its position). */
  private static final Token[] INCR_0_QUERY_TOKENS_AND = new Token[] {
    makeToken("a", 1),
    makeToken("1", 0),
    makeToken("b", 1),
    makeToken("1", 0),
    makeToken("c", 1)
  };

  /**
   * Query term groups; the increment of each group's first token minus one is used as the query
   * position. Expected to match the document at slop 2.
   */
  private static final Token[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new Token[][] {
    { makeToken("a", 1) },
    { makeToken("x", 1), makeToken("1", 0) },
    { makeToken("b", 2) },
    { makeToken("x", 2), makeToken("1", 0) },
    { makeToken("c", 3) }
  };

  /** Same layout as the matching variant, but expected not to match even at slop 2. */
  private static final Token[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCH = new Token[][] {
    { makeToken("x", 1) },
    { makeToken("a", 1), makeToken("1", 0) },
    { makeToken("x", 2) },
    { makeToken("b", 2), makeToken("1", 0) },
    { makeToken("c", 3) }
  };

  /**
   * Using query parser, MPQ will be created, and will not be strict about having all query terms
   * in each position - one of each position is sufficient (OR logic).
   */
  public void testZeroPosIncrSloppyParsedAnd() throws IOException {
    MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
    qb.add(new Term[] {new Term("field", "a"), new Term("field", "1")}, -1);
    qb.add(new Term[] {new Term("field", "b"), new Term("field", "1")}, 0);
    qb.add(new Term[] {new Term("field", "c")}, 1);
    doTestZeroPosIncrSloppy(qb.build(), 0);
    qb.setSlop(1);
    doTestZeroPosIncrSloppy(qb.build(), 0);
    qb.setSlop(2);
    doTestZeroPosIncrSloppy(qb.build(), 1);
  }

  /**
   * Indexes a single document built from {@link #INCR_0_DOC_TOKENS}, runs {@code q} against it
   * and asserts the total hit count.
   *
   * @param q query to run
   * @param nExpected expected total number of hits
   */
  private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
    Directory dir = newDirectory(); // random dir
    IndexWriterConfig cfg = newIndexWriterConfig(null);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new TextField("field", new CannedTokenStream(INCR_0_DOC_TOKENS)));
    writer.addDocument(doc);
    IndexReader r = DirectoryReader.open(writer);
    writer.close();
    IndexSearcher s = newSearcher(r);

    if (VERBOSE) {
      System.out.println("QUERY=" + q);
    }

    TopDocs hits = s.search(q, 1);
    assertEquals("wrong number of results", nExpected, hits.totalHits);

    if (VERBOSE) {
      // iterate the returned docs, not totalHits: only the top hit was requested
      for (ScoreDoc sd : hits.scoreDocs) {
        System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
      }
    }

    r.close();
    dir.close();
  }

  /**
   * PQ AND Mode - Manually creating a phrase query
   */
  public void testZeroPosIncrSloppyPqAnd() throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    int pos = -1;
    for (Token tap : INCR_0_QUERY_TOKENS_AND) {
      pos += tap.getPositionIncrement();
      builder.add(new Term("field", tap.toString()), pos);
    }
    builder.setSlop(0);
    doTestZeroPosIncrSloppy(builder.build(), 0);
    builder.setSlop(1);
    doTestZeroPosIncrSloppy(builder.build(), 0);
    builder.setSlop(2);
    doTestZeroPosIncrSloppy(builder.build(), 1);
  }

  /**
   * MPQ AND Mode - Manually creating a multiple phrase query
   */
  public void testZeroPosIncrSloppyMpqAnd() throws IOException {
    final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
    int pos = -1;
    for (Token tap : INCR_0_QUERY_TOKENS_AND) {
      pos += tap.getPositionIncrement();
      mpqb.add(new Term[] {new Term("field", tap.toString())}, pos); // AND logic
    }
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(1);
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(2);
    doTestZeroPosIncrSloppy(mpqb.build(), 1);
  }

  /**
   * MPQ Combined AND OR Mode - Manually creating a multiple phrase query
   */
  public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException {
    final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
    for (Token[] tap : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
      Term[] terms = tapTerms(tap);
      final int pos = tap[0].getPositionIncrement() - 1;
      mpqb.add(terms, pos); // AND logic in pos, OR across lines
    }
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(1);
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(2);
    doTestZeroPosIncrSloppy(mpqb.build(), 1);
  }

  /**
   * MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
   */
  public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException {
    final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
    for (Token[] tap : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCH) {
      Term[] terms = tapTerms(tap);
      final int pos = tap[0].getPositionIncrement() - 1;
      mpqb.add(terms, pos); // AND logic in pos, OR across lines
    }
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
    mpqb.setSlop(2);
    doTestZeroPosIncrSloppy(mpqb.build(), 0);
  }

  /** Converts each token's text into a {@link Term} on the field "field". */
  private Term[] tapTerms(Token[] tap) {
    Term[] terms = new Term[tap.length];
    for (int i = 0; i < terms.length; i++) {
      terms[i] = new Term("field", tap[i].toString());
    }
    return terms;
  }

  /** A negative slop must be rejected by the builder. */
  public void testNegativeSlop() throws Exception {
    MultiPhraseQuery.Builder queryBuilder = new MultiPhraseQuery.Builder();
    queryBuilder.add(new Term("field", "two"));
    queryBuilder.add(new Term("field", "one"));
    expectThrows(IllegalArgumentException.class, () -> {
      queryBuilder.setSlop(-2);
    });
  }
}