package org.apache.lucene.search.spans; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Collection; import java.util.HashSet; import java.util.Set; import java.util.Random; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.SlowMultiReaderWrapper; import org.apache.lucene.index.Payload; import org.apache.lucene.index.Term; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.payloads.PayloadHelper; import org.apache.lucene.search.payloads.PayloadSpanUtil; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.LuceneTestCase; public class TestPayloadSpans extends LuceneTestCase { private IndexSearcher searcher; private Similarity similarity = new DefaultSimilarity(); protected IndexReader indexReader; private IndexReader closeIndexReader; private Random rand; private Directory directory; public TestPayloadSpans(String s) { super(s); } @Override protected void setUp() throws Exception { super.setUp(); rand = newRandom(); PayloadHelper helper = new PayloadHelper(); searcher = helper.setUp(similarity, 1000); indexReader = searcher.getIndexReader(); } public void testSpanTermQuery() throws Exception { SpanTermQuery stq; Spans spans; stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy")); spans = stq.getSpans(indexReader); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 100, 1, 1, 1); stq = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy")); spans = stq.getSpans(indexReader); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 100, 0, 0, 0); } public void testSpanFirst() throws IOException { SpanQuery match; SpanFirstQuery sfq; match = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")); sfq = new SpanFirstQuery(match, 2); Spans spans = sfq.getSpans(indexReader); checkSpans(spans, 109, 1, 1, 1); //Test more complicated subclause SpanQuery[] clauses = new SpanQuery[2]; clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred")); match = new SpanNearQuery(clauses, 0, true); sfq = new SpanFirstQuery(match, 2); checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1); match = new SpanNearQuery(clauses, 0, false); sfq = new SpanFirstQuery(match, 2); checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1); } public void testSpanNot() throws Exception { SpanQuery[] clauses = new SpanQuery[2]; clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three")); SpanQuery spq = new SpanNearQuery(clauses, 5, true); SpanNotQuery snq = new SpanNotQuery(spq, new SpanTermQuery(new Term(PayloadHelper.FIELD, "two"))); Directory directory = newDirectory(rand); RandomIndexWriter writer = new RandomIndexWriter(rand, directory, newIndexWriterConfig(rand, TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity)); Document doc = new Document(); doc.add(new Field(PayloadHelper.FIELD, "one two three one four three", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); IndexReader reader = writer.getReader(); writer.close(); checkSpans(snq.getSpans(SlowMultiReaderWrapper.wrap(reader)), 1,new int[]{2}); reader.close(); directory.close(); } public void testNestedSpans() throws Exception { SpanTermQuery stq; Spans spans; IndexSearcher searcher = getSearcher(); stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark")); spans = stq.getSpans(searcher.getIndexReader()); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 0, null); SpanQuery[] clauses = new SpanQuery[3]; clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")); clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")); SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 12, false); spans = spanNearQuery.getSpans(searcher.getIndexReader()); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 2, new int[]{3,3}); clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")); clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")); spanNearQuery = new SpanNearQuery(clauses, 6, true); spans = spanNearQuery.getSpans(searcher.getIndexReader()); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 1, new int[]{3}); clauses = new SpanQuery[2]; clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr")); spanNearQuery = new SpanNearQuery(clauses, 6, true); // xx within 6 of rr SpanQuery[] clauses2 = new SpanQuery[2]; clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy")); clauses2[1] = spanNearQuery; SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses2, 6, false); // yy within 6 of xx within 6 of rr spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader()); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 2, new int[]{3,3}); closeIndexReader.close(); directory.close(); } public void testFirstClauseWithoutPayload() throws Exception { Spans spans; IndexSearcher searcher = getSearcher(); SpanQuery[] clauses = new SpanQuery[3]; clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nopayload")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "qq")); clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ss")); SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 6, true); SpanQuery[] clauses2 = new SpanQuery[2]; clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "pp")); clauses2[1] = spanNearQuery; SpanNearQuery snq = new SpanNearQuery(clauses2, 6, false); SpanQuery[] clauses3 = new SpanQuery[2]; clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "np")); clauses3[1] = snq; SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false); spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader()); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 1, new int[]{3}); closeIndexReader.close(); directory.close(); } public void testHeavilyNestedSpanQuery() throws Exception { Spans spans; IndexSearcher searcher = getSearcher(); SpanQuery[] clauses = new SpanQuery[3]; clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one")); clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")); clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three")); SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 5, true); clauses = new SpanQuery[3]; clauses[0] = spanNearQuery; clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "five")); clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "six")); SpanNearQuery spanNearQuery2 = new SpanNearQuery(clauses, 6, true); SpanQuery[] clauses2 = new SpanQuery[2]; clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "eleven")); clauses2[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ten")); SpanNearQuery spanNearQuery3 = new SpanNearQuery(clauses2, 2, false); SpanQuery[] clauses3 = new SpanQuery[3]; clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nine")); clauses3[1] = spanNearQuery2; clauses3[2] = spanNearQuery3; SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false); spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader()); assertTrue("spans is null and it shouldn't be", spans != null); checkSpans(spans, 2, new int[]{8, 8}); closeIndexReader.close(); directory.close(); } public void testShrinkToAfterShortestMatch() throws CorruptIndexException, LockObtainFailedException, IOException { Directory directory = newDirectory(rand); RandomIndexWriter writer = new RandomIndexWriter(rand, directory, newIndexWriterConfig(rand, TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); Document doc = new Document(); doc.add(new Field("content", new StringReader("a b c d e f g h i j a k"))); writer.addDocument(doc); IndexReader reader = writer.getReader(); IndexSearcher is = new IndexSearcher(SlowMultiReaderWrapper.wrap(reader)); writer.close(); SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a")); SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k")); SpanQuery[] sqs = { stq1, stq2 }; SpanNearQuery snq = new SpanNearQuery(sqs, 1, true); Spans spans = snq.getSpans(is.getIndexReader()); TopDocs topDocs = is.search(snq, 1); Set<String> payloadSet = new HashSet<String>(); for (int i = 0; i < topDocs.scoreDocs.length; i++) { while (spans.next()) { Collection<byte[]> payloads = spans.getPayload(); for (final byte [] payload : payloads) { payloadSet.add(new String(payload)); } } } assertEquals(2, payloadSet.size()); assertTrue(payloadSet.contains("a:Noise:10")); assertTrue(payloadSet.contains("k:Noise:11")); reader.close(); directory.close(); } public void testShrinkToAfterShortestMatch2() throws CorruptIndexException, LockObtainFailedException, IOException { Directory directory = newDirectory(rand); RandomIndexWriter writer = new RandomIndexWriter(rand, directory, newIndexWriterConfig(rand, TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); Document doc = new Document(); doc.add(new Field("content", new StringReader("a b a d k f a h i k a k"))); writer.addDocument(doc); IndexReader reader = writer.getReader(); IndexSearcher is = new IndexSearcher(SlowMultiReaderWrapper.wrap(reader)); writer.close(); SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a")); SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k")); SpanQuery[] sqs = { stq1, stq2 }; SpanNearQuery snq = new SpanNearQuery(sqs, 0, true); Spans spans = snq.getSpans(is.getIndexReader()); TopDocs topDocs = is.search(snq, 1); Set<String> payloadSet = new HashSet<String>(); for (int i = 0; i < topDocs.scoreDocs.length; i++) { while (spans.next()) { Collection<byte[]> payloads = spans.getPayload(); for (final byte[] payload : payloads) { payloadSet.add(new String(payload)); } } } assertEquals(2, payloadSet.size()); assertTrue(payloadSet.contains("a:Noise:10")); assertTrue(payloadSet.contains("k:Noise:11")); reader.close(); directory.close(); } public void testShrinkToAfterShortestMatch3() throws CorruptIndexException, LockObtainFailedException, IOException { Directory directory = newDirectory(rand); RandomIndexWriter writer = new RandomIndexWriter(rand, directory, newIndexWriterConfig(rand, TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); Document doc = new Document(); doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a"))); writer.addDocument(doc); IndexReader reader = writer.getReader(); IndexSearcher is = new IndexSearcher(SlowMultiReaderWrapper.wrap(reader)); writer.close(); SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a")); SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k")); SpanQuery[] sqs = { stq1, stq2 }; SpanNearQuery snq = new SpanNearQuery(sqs, 0, true); Spans spans = snq.getSpans(is.getIndexReader()); TopDocs topDocs = is.search(snq, 1); Set<String> payloadSet = new HashSet<String>(); for (int i = 0; i < topDocs.scoreDocs.length; i++) { while (spans.next()) { Collection<byte[]> payloads = spans.getPayload(); for (final byte [] payload : payloads) { payloadSet.add(new String(payload)); } } } assertEquals(2, payloadSet.size()); if(VERBOSE) { for (final String payload : payloadSet) System.out.println("match:" + payload); } assertTrue(payloadSet.contains("a:Noise:10")); assertTrue(payloadSet.contains("k:Noise:11")); reader.close(); directory.close(); } public void testPayloadSpanUtil() throws Exception { Directory directory = newDirectory(rand); RandomIndexWriter writer = new RandomIndexWriter(rand, directory, newIndexWriterConfig(rand, TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity)); Document doc = new Document(); doc.add(new Field(PayloadHelper.FIELD,"xx rr yy mm pp", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); IndexReader reader = writer.getReader(); writer.close(); IndexSearcher searcher = new IndexSearcher(SlowMultiReaderWrapper.wrap(reader)); PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getIndexReader()); Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr"))); if(VERBOSE) System.out.println("Num payloads:" + payloads.size()); for (final byte [] bytes : payloads) { if(VERBOSE) System.out.println(new String(bytes)); } reader.close(); directory.close(); } private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads, int expectedPayloadLength, int expectedFirstByte) throws IOException { assertTrue("spans is null and it shouldn't be", spans != null); //each position match should have a span associated with it, since there is just one underlying term query, there should //only be one entry in the span int seen = 0; while (spans.next() == true) { //if we expect payloads, then isPayloadAvailable should be true if (expectedNumPayloads > 0) { assertTrue("isPayloadAvailable is not returning the correct value: " + spans.isPayloadAvailable() + " and it should be: " + (expectedNumPayloads > 0), spans.isPayloadAvailable() == true); } else { assertTrue("isPayloadAvailable should be false", spans.isPayloadAvailable() == false); } //See payload helper, for the PayloadHelper.FIELD field, there is a single byte payload at every token if (spans.isPayloadAvailable()) { Collection<byte[]> payload = spans.getPayload(); assertTrue("payload Size: " + payload.size() + " is not: " + expectedNumPayloads, payload.size() == expectedNumPayloads); for (final byte [] thePayload : payload) { assertTrue("payload[0] Size: " + thePayload.length + " is not: " + expectedPayloadLength, thePayload.length == expectedPayloadLength); assertTrue(thePayload[0] + " does not equal: " + expectedFirstByte, thePayload[0] == expectedFirstByte); } } seen++; } assertTrue(seen + " does not equal: " + expectedNumSpans, seen == expectedNumSpans); } private IndexSearcher getSearcher() throws Exception { directory = newDirectory(rand); String[] docs = new String[]{"xx rr yy mm pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"}; RandomIndexWriter writer = new RandomIndexWriter(rand, directory, newIndexWriterConfig(rand, TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity)); Document doc = null; for(int i = 0; i < docs.length; i++) { doc = new Document(); String docText = docs[i]; doc.add(new Field(PayloadHelper.FIELD,docText, Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } closeIndexReader = writer.getReader(); writer.close(); IndexSearcher searcher = new IndexSearcher(SlowMultiReaderWrapper.wrap(closeIndexReader)); return searcher; } private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException { int cnt = 0; while (spans.next() == true) { if(VERBOSE) System.out.println("\nSpans Dump --"); if (spans.isPayloadAvailable()) { Collection<byte[]> payload = spans.getPayload(); if(VERBOSE) System.out.println("payloads for span:" + payload.size()); for (final byte [] bytes : payload) { if(VERBOSE) System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " " + new String(bytes)); } assertEquals(numPayloads[cnt],payload.size()); } else { assertFalse("Expected spans:" + numPayloads[cnt] + " found: 0",numPayloads.length > 0 && numPayloads[cnt] > 0 ); } cnt++; } assertEquals(numSpans, cnt); } final class PayloadAnalyzer extends Analyzer { @Override public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); result = new PayloadFilter(result, fieldName); return result; } } final class PayloadFilter extends TokenFilter { String fieldName; int numSeen = 0; Set<String> entities = new HashSet<String>(); Set<String> nopayload = new HashSet<String>(); int pos; PayloadAttribute payloadAtt; CharTermAttribute termAtt; PositionIncrementAttribute posIncrAtt; public PayloadFilter(TokenStream input, String fieldName) { super(input); this.fieldName = fieldName; pos = 0; entities.add("xx"); entities.add("one"); nopayload.add("nopayload"); nopayload.add("np"); termAtt = addAttribute(CharTermAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); payloadAtt = addAttribute(PayloadAttribute.class); } @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { String token = termAtt.toString(); if (!nopayload.contains(token)) { if (entities.contains(token)) { payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes())); } else { payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes())); } } pos += posIncrAtt.getPositionIncrement(); return true; } return false; } } public final class TestPayloadAnalyzer extends Analyzer { @Override public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); result = new PayloadFilter(result, fieldName); return result; } } }