package lia.advsearching; /** * Copyright Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific lan */ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Filter; import org.apache.lucene.search.SpanQueryFilter; import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.Spans; import org.apache.lucene.store.RAMDirectory; import java.io.IOException; import java.io.StringReader; // From chapter 5 public class SpanQueryTest extends TestCase { private RAMDirectory directory; private IndexSearcher searcher; private IndexReader reader; private SpanTermQuery quick; private SpanTermQuery brown; private SpanTermQuery red; private SpanTermQuery fox; private SpanTermQuery lazy; private SpanTermQuery sleepy; private SpanTermQuery dog; private SpanTermQuery cat; private Analyzer analyzer; protected void setUp() throws Exception { directory = new RAMDirectory(); analyzer = new WhitespaceAnalyzer(); IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("f", "the quick brown fox jumps over the lazy dog", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); doc = new Document(); doc.add(new Field("f", "the quick red fox jumps over the sleepy cat", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.close(); searcher = new IndexSearcher(directory); reader = searcher.getIndexReader(); quick = new SpanTermQuery(new Term("f", "quick")); brown = new SpanTermQuery(new Term("f", "brown")); red = new SpanTermQuery(new Term("f", "red")); fox = new SpanTermQuery(new Term("f", "fox")); lazy = new SpanTermQuery(new Term("f", "lazy")); sleepy = new SpanTermQuery(new Term("f", "sleepy")); dog = new SpanTermQuery(new Term("f", "dog")); cat = new SpanTermQuery(new Term("f", "cat")); } private void assertOnlyBrownFox(Query query) throws Exception { TopDocs hits = searcher.search(query, 10); assertEquals(1, hits.totalHits); assertEquals("wrong doc", 0, hits.scoreDocs[0].doc); } private void assertBothFoxes(Query query) throws Exception { TopDocs hits = searcher.search(query, 10); assertEquals(2, hits.totalHits); } private void assertNoMatches(Query query) throws Exception { TopDocs hits = searcher.search(query, 10); assertEquals(0, hits.totalHits); } public void testSpanTermQuery() throws Exception { assertOnlyBrownFox(brown); dumpSpans(brown); } public void testSpanFirstQuery() throws Exception { SpanFirstQuery sfq = new SpanFirstQuery(brown, 2); assertNoMatches(sfq); dumpSpans(sfq); sfq = new SpanFirstQuery(brown, 3); dumpSpans(sfq); assertOnlyBrownFox(sfq); } public void testSpanNearQuery() throws Exception { SpanQuery[] quick_brown_dog = new SpanQuery[]{quick, brown, dog}; SpanNearQuery snq = new SpanNearQuery(quick_brown_dog, 0, true); // #1 assertNoMatches(snq); dumpSpans(snq); snq = new SpanNearQuery(quick_brown_dog, 4, true); // #2 assertNoMatches(snq); dumpSpans(snq); snq = new SpanNearQuery(quick_brown_dog, 5, true); // #3 assertOnlyBrownFox(snq); dumpSpans(snq); // interesting - even a sloppy phrase query would require // more slop to match snq = new SpanNearQuery(new SpanQuery[]{lazy, fox}, 3, false);// #4 assertOnlyBrownFox(snq); dumpSpans(snq); PhraseQuery pq = new PhraseQuery(); // #5 pq.add(new Term("f", "lazy")); // #5 pq.add(new Term("f", "fox")); // #5 pq.setSlop(4); // #5 assertNoMatches(pq); pq.setSlop(5); // #6 assertOnlyBrownFox(pq); // #6 } /* #1 Query for three successive terms #2 Same terms, slop of 4 #3 SpanNearQuery matches #4 Nested SpanTermQuery objects in reverse order #5 Comparable PhraseQuery #6 PhraseQuery, slop of 5 */ public void testSpanQueryFilter() throws Exception { SpanQuery[] quick_brown_dog = new SpanQuery[]{quick, brown, dog}; SpanQuery snq = new SpanNearQuery(quick_brown_dog, 5, true); Filter filter = new SpanQueryFilter(snq); Query query = new MatchAllDocsQuery(); TopDocs hits = searcher.search(query, filter, 10); assertEquals(1, hits.totalHits); assertEquals("wrong doc", 0, hits.scoreDocs[0].doc); } public void testSpanNotQuery() throws Exception { SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true); assertBothFoxes(quick_fox); dumpSpans(quick_fox); SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog); assertBothFoxes(quick_fox_dog); dumpSpans(quick_fox_dog); SpanNotQuery no_quick_red_fox = new SpanNotQuery(quick_fox, red); assertOnlyBrownFox(no_quick_red_fox); dumpSpans(no_quick_red_fox); } public void testSpanOrQuery() throws Exception { SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true); SpanNearQuery lazy_dog = new SpanNearQuery(new SpanQuery[]{lazy, dog}, 0, true); SpanNearQuery sleepy_cat = new SpanNearQuery(new SpanQuery[]{sleepy, cat}, 0, true); SpanNearQuery qf_near_ld = new SpanNearQuery( new SpanQuery[]{quick_fox, lazy_dog}, 3, true); assertOnlyBrownFox(qf_near_ld); dumpSpans(qf_near_ld); SpanNearQuery qf_near_sc = new SpanNearQuery( new SpanQuery[]{quick_fox, sleepy_cat}, 3, true); dumpSpans(qf_near_sc); SpanOrQuery or = new SpanOrQuery( new SpanQuery[]{qf_near_ld, qf_near_sc}); assertBothFoxes(or); dumpSpans(or); } public void testPlay() throws Exception { SpanOrQuery or = new SpanOrQuery(new SpanQuery[]{quick, fox}); dumpSpans(or); SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true); SpanFirstQuery sfq = new SpanFirstQuery(quick_fox, 4); dumpSpans(sfq); dumpSpans(new SpanTermQuery(new Term("f", "the"))); SpanNearQuery quick_brown = new SpanNearQuery(new SpanQuery[]{quick, brown}, 0, false); dumpSpans(quick_brown); } private void dumpSpans(SpanQuery query) throws IOException { Spans spans = query.getSpans(reader); System.out.println(query + ":"); int numSpans = 0; TopDocs hits = searcher.search(query, 10); float[] scores = new float[2]; for (ScoreDoc sd : hits.scoreDocs) { scores[sd.doc] = sd.score; } while (spans.next()) { // A numSpans++; int id = spans.doc(); Document doc = reader.document(id); // B TokenStream stream = analyzer.tokenStream("contents", // C new StringReader(doc.get("f"))); // C TermAttribute term = stream.addAttribute(TermAttribute.class); StringBuilder buffer = new StringBuilder(); buffer.append(" "); int i = 0; while(stream.incrementToken()) { // D if (i == spans.start()) { // E buffer.append("<"); // E } // E buffer.append(term.term()); // E if (i + 1 == spans.end()) { // E buffer.append(">"); // E } // E buffer.append(" "); i++; } buffer.append("(").append(scores[id]).append(") "); System.out.println(buffer); } if (numSpans == 0) { System.out.println(" No spans"); } System.out.println(); } // A Step through each span // B Retrieve document // C Re-analyze text // D Step through all tokens // E Print < and > around span }