package org.apache.lucene.queryparser.spans;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * End-to-end tests for {@link SpanQueryParser}: query strings are parsed and
 * then either inspected structurally or run against a small fixed index whose
 * expected matching document ids are listed in each assertion.
 */
public class TestOverallSpanQueryParser extends LuceneTestCase {

  private static final String FIELD1 = "f1";
  private static final String FIELD2 = "f2";

  private static Analyzer ANALYZER = null;
  // Always null in this class; it is handed to every SpanQueryParser built here.
  // NOTE(review): confirm how SpanQueryParser treats a null multiterm analyzer.
  private static final Analyzer MULTITERM_ANALYZER = null;
  private static Directory DIRECTORY = null;
  private static IndexReader READER = null;
  private static IndexSearcher SEARCHER = null;
  private static SpanQueryParser PARSER;

  /**
   * Builds the shared index (one document per entry of the two fixture arrays,
   * with the f1 text in {@code FIELD1} and the f2 text in {@code FIELD2}) and
   * the shared searcher and parser.
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    ANALYZER = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
    DIRECTORY = newDirectory();

    RandomIndexWriter writer = new RandomIndexWriter(random(), DIRECTORY,
        newIndexWriterConfig(ANALYZER)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));

    // The trailing number is the array index; the tests use the same numbers
    // as the expected document ids.
    String[] f1Docs = new String[] {
        "quick brown AND fox", //0
        "quick brown AND dog", //1
        "quick brown dog", //2
        "whan that aprile with its shoures perced", //3
        "its shoures pierced", //4
        "its shoures perced", //5
        "#####", //before asterisk //6
        "&&&&&", //after asterisk for range query //7
        "ab*de", //8
        "abcde", //9
        "blah disco fever blah", //10
        "blah bieber fever blah", //11
        "blah dengue fever blah", //12
        "blah saturday night fever with john travolta", //13
        "understanding (span query)", //14
        "understanding (sp'an query)", //15
        "understanding something about (span query)", //16
        "0 1 fox 3 4 5 fox 7 8 9 10 fox" //17
    };

    String[] f2Docs = new String[] {
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen"
    };

    // The loop below indexes both arrays with the same counter.
    assertEquals("fixture arrays must be the same length", f1Docs.length, f2Docs.length);

    for (int i = 0; i < f1Docs.length; i++) {
      Document doc = new Document();
      doc.add(newTextField(FIELD1, f1Docs[i], Field.Store.YES));
      doc.add(newTextField(FIELD2, f2Docs[i], Field.Store.YES));
      writer.addDocument(doc);
    }
    READER = writer.getReader();
    SEARCHER = newSearcher(READER);
    writer.close();

    PARSER = new SpanQueryParser(FIELD1, ANALYZER, MULTITERM_ANALYZER);
  }

  /** Closes the shared reader and directory and clears every static fixture. */
  @AfterClass
  public static void afterClass() throws Exception {
    READER.close();
    DIRECTORY.close();
    READER = null;
    SEARCHER = null;
    DIRECTORY = null;
    ANALYZER = null;
    // PARSER was built from ANALYZER; clear it along with the other fixtures.
    PARSER = null;
  }

  /** Single-quote and backslash escaping of parentheses and quotes inside a phrase. */
  public void testEscaping() throws Exception {
    //example to show escaping
    compareHits("\"understanding '(span' 'query)'\"", 14);
    compareHits("\"understanding '(sp''an' 'query)'\"", 15);

    compareHits("\"understanding \\(span query\\)\"", 14);
    compareHits("\"understanding \\(sp\\'an query\\)\"", 15);
  }

  /** Span-not queries on f1 combined with required/prohibited clauses on f2. */
  public void testComplexQueries() throws Exception {
    //complex span not
    compareHits("+f1:[fever (bieber [jo*n travlota~1] disc*)]!~2,5 +f2:(ten eleven twelve thirteen)", 12);
    compareHits("+f1:[fever (bieber [jo*n travlota~1] disc*)]!~2,5 -f2:(ten eleven twelve thirteen)");
    compareHits("+f1:[fever (bieber [travlota~1 jo*n]~>3 disc*)]!~2,5 +f2:(ten eleven twelve thirteen)", 12, 13);
    compareHits("+f1:[fever (bieber [jo*n travlota~1]~>3 disc*)]!~2,5 +f2:(ten eleven twelve thirteen)", 12);
    compareHits("-f1:[fever (bieber [jo*n travlota~1]~>3 disc*)]!~2,5 +f2:(ten eleven twelve thirteen)", 10, 11, 13);
  }

  /** Queries consisting only of prohibited clauses. */
  public void testNegativeOnly() throws Exception {
    //negative only queries
    compareHits("-fever", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, 16, 17);
    compareHits("-f1:fever", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, 16, 17);
    compareHits("-fever -brown", 3, 4, 5, 6, 7, 8, 9, 14, 15, 16, 17);
  }

  /** A span-near max distance of -1 must be interpreted as "no limit". */
  public void testUnlimitedRange() throws Exception {
    //just make sure that -1 is interpreted as infinity
    // A dedicated parser is configured and passed explicitly: the
    // compareHits(String, int...) overload builds its own parser, so settings
    // applied to the shared PARSER would never reach the query under test and
    // would linger for the other tests that use PARSER.
    SpanQueryParser p = new SpanQueryParser(FIELD1, ANALYZER, MULTITERM_ANALYZER);
    p.setSpanNearMaxDistance(-1);
    p.setPhraseSlop(0);
    compareHits(p, "[quick dog]~10", 1, 2);
  }

  /** Checks the {@link Occur} assigned to each clause for AND / NOT / + / - operators. */
  public void testBooleanQueryConstruction() throws Exception {
    String s = "cat dog AND elephant aardvark";
    Query q = PARSER.parse(s);
    assertTrue(q instanceof BooleanQuery);
    BooleanQuery bq = (BooleanQuery) q;
    List<BooleanClause> clauses = bq.clauses();
    assertEquals(4, clauses.size());
    testForClause(clauses, "cat", Occur.SHOULD);
    testForClause(clauses, "dog", Occur.MUST);
    testForClause(clauses, "elephant", Occur.MUST);
    testForClause(clauses, "aardvark", Occur.SHOULD);

    s = "cat dog NOT elephant aardvark";
    q = PARSER.parse(s);
    assertTrue(q instanceof BooleanQuery);
    bq = (BooleanQuery) q;
    clauses = bq.clauses();
    assertEquals(4, clauses.size());
    testForClause(clauses, "cat", Occur.SHOULD);
    testForClause(clauses, "dog", Occur.SHOULD);
    testForClause(clauses, "elephant", Occur.MUST_NOT);
    testForClause(clauses, "aardvark", Occur.SHOULD);

    s = "cat +dog -elephant +aardvark";
    q = PARSER.parse(s);
    assertTrue(q instanceof BooleanQuery);
    bq = (BooleanQuery) q;
    clauses = bq.clauses();
    assertEquals(4, clauses.size());
    testForClause(clauses, "cat", Occur.SHOULD);
    testForClause(clauses, "dog", Occur.MUST);
    testForClause(clauses, "elephant", Occur.MUST_NOT);
    testForClause(clauses, "aardvark", Occur.MUST);
  }

  /** Field prefixes, their scope over groups, and fallback to the parser's default field. */
  public void testFields() throws Exception {
    compareHits("f1:brown f2:three", 0, 1, 2, 3);

    //four should go back to f1
    compareHits("f1:brown f2:three four", 0, 1, 2, 3);
    compareHits("f1:brown f2:(three four)", 0, 1, 2, 3, 4);
    compareHits("f1:brown f2:(three four) five", 0, 1, 2, 3, 4);
    compareHits("f1:brown f2:(three four) f2:five", 0, 1, 2, 3, 4, 5);
    compareHits("f1:brown f2:(f1:three four) f2:five", 0, 1, 2, 4, 5);

    // Same queries with FIELD2 as the default field.
    SpanQueryParser p = new SpanQueryParser(FIELD2, ANALYZER, MULTITERM_ANALYZER);
    compareHits(p, "f1:brown three four", 0, 1, 2, 3, 4);
    compareHits(p, "f1:brown (three four)", 0, 1, 2, 3, 4);
    compareHits(p, "f1:brown (three four) five", 0, 1, 2, 3, 4, 5);
    compareHits(p, "f1:brown (f1:three four) five", 0, 1, 2, 4, 5);
  }

  /** A plain OR group versus the same group with a {@code ~2} suffix. */
  public void testBooleanOrHits() throws Exception {
    compareHits("f2:three (brown dog)", 0, 1, 2, 3);
    compareHits("f2:three (brown dog)~2", 1, 2, 3);
  }

  /** Boolean operators at the top level versus the literal token AND inside brackets. */
  public void testBooleanHits() throws Exception {
    //test treatment of AND within phrase
    compareHits("quick NOT [brown AND (fox dog)]", 2);
    compareHits("quick AND [bruwn~1 AND (f?x do?)]", 0, 1);
    compareHits("(whan AND aprile) (shoures NOT perced)", 3, 4);
  }

  /**
   * Asserts that {@code clauses} contains a clause on {@code FIELD1:term} with
   * the given occur, built from either a {@link SpanTermQuery} or a
   * {@link TermQuery}.
   */
  private void testForClause(List<BooleanClause> clauses, String term, Occur occur) {
    Term t = new Term(FIELD1, term);
    assertTrue(
        clauses.contains(new BooleanClause(new SpanTermQuery(t), occur))
            || clauses.contains(new BooleanClause(new TermQuery(t), occur)));
  }

  /** Runs {@code s} through a freshly built FIELD1 parser against the shared searcher. */
  private void compareHits(String s, int... docids) throws Exception {
    compareHits(new SpanQueryParser(FIELD1, ANALYZER, MULTITERM_ANALYZER), s, docids);
  }

  /** Runs {@code s} through {@code p} against the shared searcher. */
  private void compareHits(SpanQueryParser p, String s, int... docids) throws Exception {
    compareHits(p, s, SEARCHER, docids);
  }

  /**
   * Parses {@code s} with {@code p}, searches with {@code searcher}, and asserts
   * that the set of matching document ids has the same size as {@code docids}
   * and contains each of them.
   */
  private void compareHits(SpanQueryParser p, String s, IndexSearcher searcher, int... docids)
      throws Exception {
    Query q = p.parse(s);
    TopScoreDocCollector results = TopScoreDocCollector.create(1000);
    searcher.search(q, results);

    Set<Integer> hits = new HashSet<>();
    for (ScoreDoc scoreDoc : results.topDocs().scoreDocs) {
      hits.add(scoreDoc.doc);
    }

    // Include the query and the actual hits so a failure identifies its cause.
    assertEquals("unexpected number of hits for query <" + s + ">, hits=" + hits,
        docids.length, hits.size());
    for (int docid : docids) {
      assertTrue("couldn't find " + docid + " among the hits for query <" + s + ">",
          hits.contains(docid));
    }
  }

  /** Malformed operator sequences must each raise a {@link ParseException}. */
  public void testExceptions() {
    String[] strings = new String[] {
        "cat OR OR dog",
        "cat OR AND dog",
        "cat AND AND dog",
        "cat NOT NOT dog",
        "cat NOT AND dog",
        "cat NOT OR dog",
        "cat NOT -dog",
        "cat NOT +dog",
        "OR",
        "AND dog",
        "OR dog",
        "dog AND",
        "dog OR",
        "dog NOT",
    };

    for (String s : strings) {
      testException(s, PARSER);
    }
  }

  /** Fails unless parsing {@code s} with {@code p} throws {@link ParseException}. */
  private void testException(String s, SpanQueryParser p) {
    try {
      p.parse(s);
      fail("didn't get expected exception:" + s);
    } catch (ParseException expected) {
      // this is the outcome under test
    }
  }

  /**
   * Checks the local {@code isCharEscaped} helper and that the parser yields
   * a term, fuzzy or wildcard query depending on how many backslashes precede
   * the operator character.
   */
  public void testIsEscaped() throws Exception {
    String[] notEscaped = new String[] {
        "abcd",
        "a\\\\d",
    };
    for (String s : notEscaped) {
      assertFalse(s, isCharEscaped(s, 3));
    }

    String[] escaped = new String[] {
        "ab\\d",
        "\\\\\\d",
    };
    for (String s : escaped) {
      assertTrue(s, isCharEscaped(s, 3));
    }

    Query q = PARSER.parse("abc\\~2.0");
    assertTrue(q.toString(), q instanceof TermQuery);
    q = PARSER.parse("abc\\\\\\~2.0");
    assertTrue(q.toString(), q instanceof TermQuery);
    q = PARSER.parse("abc\\\\~2.0");
    assertTrue(q.toString(), q instanceof FuzzyQuery);

    q = PARSER.parse("abc\\*d");
    assertTrue(q.toString(), q instanceof TermQuery);
    q = PARSER.parse("abc\\\\\\*d");
    assertTrue(q.toString(), q instanceof TermQuery);
    q = PARSER.parse("abc\\\\*d");
    assertTrue(q.toString(), q instanceof WildcardQuery);
  }

  /**
   * Hit counts when both indexing and query parsing use an analyzer configured
   * with {@code MockTokenFilter.ENGLISH_STOPSET}.
   */
  public void testStops() throws Exception {
    Analyzer stopsAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
        MockTokenFilter.ENGLISH_STOPSET);
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(stopsAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] {
        "ab the the cd the the the ef the gh",
        "ab cd",
        "ab the ef"
    };

    for (int i = 0; i < docs.length; i++) {
      Document doc = new Document();
      doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
      w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    w.close();
    SpanQueryParser p = new SpanQueryParser(FIELD1, stopsAnalyzer, MULTITERM_ANALYZER);

    assertHits("-ab +the +cd", p, s, 0);
    assertHits("+ab +the +cd", p, s, 2);
    assertHits("+the", p, s, 0);
    assertHits("ab AND CD", p, s, 2);
    assertHits("ab AND the", p, s, 3);
    assertHits("ab OR the", p, s, 3);
    assertHits("(ab the cd)~2", p, s, 2);
    assertHits("(ab the cd)~3", p, s, 0);
    assertHits("ab AND (the OR cd)", p, s, 2);
    assertHits("ab AND (the AND cd)", p, s, 2);
    assertHits("cd OR (the OR ef)", p, s, 3);
    assertHits("cd AND (the AND ef)", p, s, 1);
    //do we want this behavior?
    assertHits("-the", p, s, 0);

    assertHits("\"ab cd\"", p, s, 1);
    assertHits("\"ab a a cd\"", p, s, 2);
    assertHits("\"ab a cd\"~1", p, s, 2);
    assertHits("\"ab a cd\"~>1", p, s, 2);
    assertHits("\"cd a a ab\"", p, s, 0);
    assertHits("\"cd a ab\"~1", p, s, 2);

    r.close();
    dir.close();
  }

  /** Position-range suffixes ({@code @start..end}) on terms, span-near and OR groups. */
  @Test
  public void testSpanPositionRangeQueries() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(ANALYZER)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] {
        "zebra 1 2 3 4 5 6 7 8 9 10",
        "0 1 2 3 zebra 5 6 7 8 9 10",
        "0 1 2 3 4 5 6 7 zebra 9 10",
        "a foo bar a b a b a b a b a b a b a b a b",
        "a b a b a b a b a foo bar a b a b a b a b",
        "a b a b a b a b a b a b a b a b a foo bar",
    };

    for (int i = 0; i < docs.length; i++) {
      Document doc = new Document();
      doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
      w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    w.close();

    //basic
    for (String term : new String[] {
        "zebra",
        "zeabra~1",
        "z?bra",
        "zebr*",
        "ze*ra",
        "zebar~1"
    }) {
      compareHits(PARSER, term, s, 0, 1, 2);
      compareHits(PARSER, term + "@2..", s, 1, 2);
      compareHits(PARSER, term + "@2..5", s, 1);
      compareHits(PARSER, term + "@..6", s, 0, 1);
    }

    //this should have no hits
    compareHits(PARSER, "zebar~>1@2..", s);

    //test spanOr
    compareHits(PARSER, "(foo bar)", s, 3, 4, 5);

    compareHits(PARSER, "[a (foo bar)]~@5..", s, 4, 5);
    compareHits(PARSER, "[a (foo bar)]~@13..", s, 5);
    compareHits(PARSER, "[a (foo bar)]~@5..13", s, 4);
    compareHits(PARSER, "[a (foo bar)]~@..5", s, 3);
    compareHits(PARSER, "[a (foo bar)]~@..13", s, 3, 4);

    //parser doesn't test for child ranges inconsistent with parent range
    compareHits(PARSER, "[a@100.. (foo bar)]~@..13", s);

    compareHits(PARSER, "(foo@13.. bar)", s, 3, 4, 5);
    compareHits(PARSER, "(foo@13.. bar@13..)", s, 5);
    compareHits(PARSER, "(foo@3..13 bar@3..13)", s, 4);
    compareHits(PARSER, "(foo@..13 bar@..13)", s, 3, 4);

    //Boolean needs to be SpanOr
    compareHits(PARSER, "(foo bar)@13..", s, 5);
    compareHits(PARSER, "(foo bar)@3..13", s, 4);
    compareHits(PARSER, "(foo bar)@..13", s, 3, 4);

    //minimum number can only apply to BooleanQuery
    //positionRange forces this to SpanOr.  Do not allow!
    testException("(foo bar)~2@..13", PARSER);

    r.close();
    dir.close();
  }

  /**
   * Parses {@code qString} with {@code p}, searches with {@code s}, and asserts
   * that exactly {@code expected} documents are returned.
   */
  private void assertHits(String qString, SpanQueryParser p, IndexSearcher s, int expected)
      throws Exception {
    Query q = p.parse(qString);
    TopScoreDocCollector results = TopScoreDocCollector.create(1000);
    s.search(q, results);
    ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
    assertEquals(qString, expected, scoreDocs.length);
  }

  /**
   * Returns true when the character at index {@code i} of {@code s} is
   * immediately preceded by an odd number of consecutive backslashes.
   */
  private static boolean isCharEscaped(String s, int i) {
    int j = i;
    int esc = 0;
    // Count the run of backslashes directly to the left of position i.
    while (--j >= 0 && s.charAt(j) == '\\') {
      esc++;
    }
    return esc % 2 != 0;
  }
}