/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.concordance;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.concordance.classic.AbstractConcordanceWindowCollector;
import org.apache.lucene.search.concordance.classic.ConcordanceSearcher;
import org.apache.lucene.search.concordance.classic.ConcordanceSortOrder;
import org.apache.lucene.search.concordance.classic.ConcordanceWindow;
import org.apache.lucene.search.concordance.classic.DocIdBuilder;
import org.apache.lucene.search.concordance.classic.DocMetadataExtractor;
import org.apache.lucene.search.concordance.classic.WindowBuilder;
import org.apache.lucene.search.concordance.classic.impl.ConcordanceWindowCollector;
import org.apache.lucene.search.concordance.classic.impl.DedupingConcordanceWindowCollector;
import org.apache.lucene.search.concordance.classic.impl.DefaultSortKeyBuilder;
import org.apache.lucene.search.concordance.classic.impl.IndexIdDocIdBuilder;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for {@code ConcordanceSearcher}: basic window extraction, pre/post sort
 * orders, multi-valued fields, inter-value position/offset gaps, stop words,
 * conversion of standard {@link Query}s to span queries, query rewriting
 * (prefix queries), target-overlap handling, and deduping collectors.
 */
public class TestConcordanceSearcher extends ConcordanceTestBase {

  /**
   * No-op metadata extractor shared by tests that exercise the full
   * {@code WindowBuilder} constructor: selects no stored fields and
   * returns an empty metadata map for every document.
   */
  private final static DocMetadataExtractor metadataExtractor =
      new DocMetadataExtractor() {
        private final Set<String> fields = new HashSet<>();
        private final Map<String, String> data = new HashMap<>();

        @Override
        public Set<String> getFieldSelector() {
          return fields;
        }

        @Override
        public Map<String, String> extract(Document d) {
          return data;
        }
      };

  /** Builds window doc ids from the Lucene index id. */
  private final static DocIdBuilder docIdBuilder = new IndexIdDocIdBuilder();

  @BeforeClass
  public static void beforeClass() throws Exception {
    // NOOP for now
  }

  @AfterClass
  public static void afterClass() throws Exception {
    // NOOP for now
  }

  /**
   * Basic search over two single-valued docs: verifies hit-count capping,
   * COLLECT_ALL, and that windows sort correctly by PRE and then by POST key.
   * The sort key is built at search time, so changing the sort order requires
   * a re-search with a new WindowBuilder.
   */
  @Test
  public void testSimple() throws Exception {
    String[] docs = new String[]{"a b c a b c", "c b a c b a"};
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(10, 10,
        analyzer.getOffsetGap(FIELD),
        new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);

    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));

    // collector capped at 3 windows even though there are 4 hits
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);
    assertEquals(3, collector.size());

    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    // test result size
    assertEquals(4, collector.size());

    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
    String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
    for (int i = 0; i < windows.size(); i++) {
      ConcordanceWindow w = windows.get(i);
      assertEquals(pres[i], w.getPre());
      assertEquals(posts[i], w.getPost());
    }

    // test sort order post
    // sort key is built at search time, so must re-search
    wb = new WindowBuilder(10, 10,
        analyzer.getOffsetGap(FIELD),
        new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);
    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q,
        null, analyzer, collector);
    windows = collector.getSortedWindows();
    posts = new String[]{"", " b c", " b c a b c", " c b a",};
    for (int i = 0; i < windows.size(); i++) {
      ConcordanceWindow w = windows.get(i);
      assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
  }

  /**
   * Same expectations as {@link #testSimple()} but with both values stored in
   * one multi-valued field of a single document.
   */
  @Test
  public void testSimpleMultiValuedField() throws Exception {
    String[] doc = new String[]{"a b c a b c", "c b a c b a"};
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);

    // test result size
    assertEquals(4, collector.size());

    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
    String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
    for (int i = 0; i < pres.length; i++) {
      ConcordanceWindow w = windows.get(i);
      assertEquals("pres: " + i, pres[i], w.getPre());
      assertEquals("posts: " + i, posts[i], w.getPost());
    }

    // test sort order post
    // sort key is built at search time, so must re-search
    WindowBuilder wb = new WindowBuilder(10, 10,
        analyzer.getOffsetGap(FIELD),
        new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);
    collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    windows = collector.getSortedWindows();
    posts = new String[]{"", " b c", " b c a b c", " c b a",};
    for (int i = 0; i < posts.length; i++) {
      ConcordanceWindow w = windows.get(i);
      assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
  }

  /**
   * Sweeps tokensBefore x tokensAfter over a 7-token doc centered on "d" and
   * checks that pre/post context is truncated to the requested token counts
   * (capped by the document boundaries).
   */
  @Test
  public void testWindowLengths() throws Exception {
    String[] doc = new String[]{"a b c d e f g"};
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
    String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"};
    String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"};
    for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) {
      for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) {
        WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter,
            analyzer.getOffsetGap(FIELD));
        ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
        ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
        searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
        ConcordanceWindow w = collector.getSortedWindows().get(0);
        assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre());
        assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost());
      }
    }
    reader.close();
    directory.close();
  }

  /**
   * Exercises a target phrase that spans multiple entries of a multi-valued
   * field ("the" / "clockwork" / "orange" in three separate values), with
   * varying position/offset gaps between the values.
   */
  @Test
  public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
    /*
     * test handling of target match (or not) over different indices into multivalued
     * field array
     */
    String[] doc = new String[]{"a b c a b the", "clockwork",
        "orange b a c b a"};
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q1 = new SpanTermQuery(
        new Term(FIELD, "the"));
    SpanQuery q2 = new SpanTermQuery(new Term(FIELD,
        "clockwork"));
    SpanQuery q3 = new SpanTermQuery(new Term(FIELD,
        "orange"));
    SpanQuery q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 3, true);
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);
    assertEquals(1, collector.size());
    ConcordanceWindow w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());
    reader.close();
    directory.close();

    // test hit even over long inter-field gap: slop must cover the
    // position increment gap (20) between field values
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);
    wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
    searcher = new ConcordanceSearcher(wb);
    q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 120, true);
    collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(1, collector.size());
    w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());
    reader.close();
    directory.close();

    // test miss: slop (5) is smaller than the inter-value gap (100)
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);
    wb = new WindowBuilder();
    searcher = new ConcordanceSearcher(wb);
    q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 5, true);
    collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(0, collector.size());
    reader.close();
    directory.close();
  }

  /**
   * Verifies window context handling around stop words ("the"): stopped
   * positions still consume token slots in the pre/post windows.
   */
  @Test
  public void testWithStops() throws Exception {
    String[] docs = new String[]{"a b the d e the f", "g h the d the j"};
    Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    assertEquals(2, windows.size());
    // the second word after the target is a stop word
    // this post-component of this window should only go to the first word after
    // the target
    assertEquals("b the", windows.get(0).getPre());
    assertEquals("d", windows.get(0).getTarget());
    assertEquals(" e", windows.get(0).getPost());

    assertEquals("h the", windows.get(1).getPre());
    assertEquals("d", windows.get(1).getTarget());
    assertEquals(" the j", windows.get(1).getPost());

    reader.close();
    directory.close();
  }

  /**
   * Standard (non-span) BooleanQuery is converted for span searching:
   * MUST_NOT excludes whole documents, and a separate filter query further
   * restricts the matching documents.
   */
  @Test
  public void testBasicStandardQueryConversion() throws Exception {
    String[] docs = new String[]{"a b c a b c", "c b a c b a d e a",
        "c b a c b a e a b c a"};
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    BooleanQuery q = new BooleanQuery.Builder()
        .add(new TermQuery(new Term(FIELD, "a")), Occur.MUST)
        .add(new TermQuery(new Term(FIELD, "d")),
            Occur.MUST_NOT).build();

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher,
        FIELD, q, null,
        analyzer, collector);
    // shouldn't include document with "d"
    assertEquals(6, collector.size());

    // should only include document with "e" and not "d"
    Query filter = new TermQuery(new Term(
        FIELD, "e"));
    collector = new ConcordanceWindowCollector(10);
    // (Query) cast removed: a BooleanQuery is never a SpanQuery, so the
    // upcast could not affect overload resolution
    searcher.search(indexSearcher, FIELD, q, filter, analyzer, collector);
    assertEquals(4, collector.size());
    reader.close();
    directory.close();
  }

  /**
   * A Query whose term targets a different field than the one being searched
   * should yield zero windows and no exception.
   */
  @Test
  public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
    // tests what happens if a Query doesn't contain a term in the "span" field
    // in the searcher...should be no exception and zero documents returned.
    String[] docs = new String[]{"a b c a b c",};
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term("_" + FIELD, "a"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);
    // declared at first use (was a dead "= -1" initialization)
    int windowCount = collector.size();
    assertEquals(0, windowCount);
    reader.close();
    directory.close();
  }

  /**
   * Deduping collector folds case-insensitively-equal windows together;
   * verifies both the capped and COLLECT_ALL unique counts.
   */
  @Test
  public void testUniqueCollector() throws Exception {
    String[] docs = new String[]{"a b c d c b a",
        "a B C d c b a",
        "a b C d C B a",
        "a b c d C B A",
        "e f g d g f e",
        "h i j d j i h"
    };

    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
    DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2);
    // NOTE(review): the (Query) cast is kept deliberately — it may select a
    // Query overload of search() (exercising query conversion) rather than a
    // SpanQuery overload; verify before removing.
    searcher.search(indexSearcher,
        FIELD, (Query) q, null,
        analyzer, collector);
    assertEquals(2, collector.size());

    collector =
        new DedupingConcordanceWindowCollector(AbstractConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher,
        FIELD, (Query) q, null,
        analyzer, collector);
    assertEquals(3, collector.size());
    reader.close();
    directory.close();
  }

  /**
   * When more duplicates arrive than the cap, the deduped window still
   * accumulates the full duplicate count.
   */
  @Test
  public void testUniqueCollectorWithSameWindowOverflow() throws Exception {
    String[] docs = new String[]{"a b c d c b a",
        "a b c d c b a",
        "a b c d c b a",
        "a b c d c b a",
        "e f g d g f e",
        "h i j d j i h"
    };

    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
    DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3);
    // (Query) cast kept as in the original — see testUniqueCollector
    searcher.search(indexSearcher,
        FIELD, (Query) q, null,
        analyzer, collector);
    assertEquals(3, collector.size());
    assertEquals(4, collector.getSortedWindows().get(0).getCount());
    reader.close();
    directory.close();
  }

  /**
   * By default overlapping targets ("a" vs. "a b" starting at the same
   * position) yield one window; with setAllowTargetOverlaps(true) both
   * matches produce windows.
   */
  @Test
  public void testAllowTargetOverlaps() throws Exception {
    String[] docs = new String[]{"a b c"};
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(10, 10,
        analyzer.getOffsetGap(FIELD),
        new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery term = new SpanTermQuery(new Term(FIELD, "a"));
    SpanQuery phrase = new SpanNearQuery(
        new SpanQuery[]{
            new SpanTermQuery(new Term(FIELD, "a")),
            new SpanTermQuery(new Term(FIELD, "b"))
        }, 0, true);
    SpanOrQuery q = new SpanOrQuery(
        new SpanQuery[]{
            term,
            phrase
        }
    );
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);

    //default should be: don't allow target overlaps
    assertEquals(1, collector.size());

    searcher.setAllowTargetOverlaps(true);
    collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD,
        q, null, analyzer, collector);
    //now there should be two windows with allowTargetOverlaps = true
    assertEquals(2, collector.size());
    reader.close();
    directory.close();
  }

  /**
   * Multi-term (prefix) queries must be rewritten before span conversion,
   * both as the main query and as the filter — plain or wrapped in a
   * SpanMultiTermQueryWrapper.
   */
  @Test
  public void testRewrites() throws Exception {
    //test to make sure that queries are rewritten
    //first test straight prefix queries
    String[] docs = new String[]{"aa ba ca aa ba ca", "ca ba aa ca ba aa da ea za",
        "ca ba aa ca ba aa ea aa ba ca za"};
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
        new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    BooleanQuery q = new BooleanQuery.Builder()
        .add(new PrefixQuery(new Term(FIELD, "a")), Occur.MUST)
        .add(new PrefixQuery(new Term(FIELD, "d")),
            Occur.MUST_NOT).build();

    //now test straight and span wrapper
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher,
        FIELD, q, new PrefixQuery(new Term(FIELD, "z")),
        analyzer, collector);
    // shouldn't include document with "da", but must include one with za
    assertEquals(3, collector.size());

    collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher,
        FIELD, q, new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term(FIELD, "z"))),
        analyzer, collector);
    // shouldn't include document with "da", but must include one with za
    assertEquals(3, collector.size());
    reader.close();
    directory.close();
  }
}