/** * Copyright 2014 National University of Ireland, Galway. * * This file is part of the SIREn project. Project and contact information: * * https://github.com/rdelbru/SIREn * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sindice.siren.index.codecs.siren10; import static org.sindice.siren.analysis.MockSirenDocument.doc; import static org.sindice.siren.analysis.MockSirenToken.node; import static org.sindice.siren.analysis.MockSirenToken.token; import java.io.IOException; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; import org.sindice.siren.analysis.MockSirenDocument; import org.sindice.siren.index.DocsAndNodesIterator; import org.sindice.siren.index.codecs.RandomSirenCodec.PostingsFormatType; import org.sindice.siren.index.codecs.siren10.Siren10PostingsReader.Siren10DocsEnum; import org.sindice.siren.index.codecs.siren10.Siren10PostingsReader.Siren10DocsNodesAndPositionsEnum; import org.sindice.siren.util.BasicSirenTestCase; public class TestSiren10PostingsFormat extends BasicSirenTestCase { @Override protected void configure() throws IOException { this.setAnalyzer(AnalyzerType.MOCK); this.setPostingsFormat(PostingsFormatType.SIREN_10); } @Test public void testSimpleNextDocument() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))), doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))), doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7))) ); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, "aaa")); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); assertEquals(-1, e.doc()); assertEquals(0, e.nodeFreqInDoc()); assertTrue(e.nextDocument()); assertEquals(0, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextDocument()); assertEquals(1, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertTrue(e.nextDocument()); assertEquals(2, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertFalse(e.nextDocument()); assertEquals(DocsAndNodesIterator.NO_MORE_DOC, e.doc()); } @Test public void testSkipDoc() throws IOException { final MockSirenDocument[] docs = new MockSirenDocument[2048]; for (int i = 0; i < 2048; i += 4) { docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))); docs[i + 1] = doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))); docs[i + 2] = doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7))); docs[i + 3] = doc(token("bbb", node(2,0)), token("aaa", node(5,3,6))); } this.addDocuments(docs); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); // first skip in skiplist is at 512 assertTrue(e.skipTo(502)); assertEquals(502, e.doc()); assertEquals(1, e.nodeFreqInDoc()); // must have used the second skip assertTrue(e.skipTo(1624)); assertEquals(1624, e.doc()); assertEquals(2, e.nodeFreqInDoc()); // no other skip, must have used the linear scan assertTrue(e.skipTo(2000)); assertEquals(2000, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertFalse(e.skipTo(256323)); } @Test public void testSimpleNextNode() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))), doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))), doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7))) ); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); assertEquals(-1, e.doc()); assertEquals(0, e.nodeFreqInDoc()); assertEquals(node(-1), e.node()); assertTrue(e.nextDocument()); assertEquals(0, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1), e.node()); assertTrue(e.nextNode()); assertEquals(node(2), e.node()); assertFalse(e.nextNode()); assertEquals(DocsAndNodesIterator.NO_MORE_NOD, e.node()); assertTrue(e.nextDocument()); assertEquals(1, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1,0), e.node()); assertFalse(e.nextNode()); assertEquals(DocsAndNodesIterator.NO_MORE_NOD, e.node()); assertTrue(e.nextDocument()); assertEquals(2, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(5,3,6,3), e.node()); assertFalse(e.nextNode()); assertEquals(DocsAndNodesIterator.NO_MORE_NOD, e.node()); assertFalse(e.nextDocument()); assertEquals(DocsAndNodesIterator.NO_MORE_DOC, e.doc()); } @Test public void testSimpleSkipNode() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))), doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))), doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7))) ); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); assertEquals(-1, e.doc()); assertEquals(0, e.nodeFreqInDoc()); // skip to 2 using linear scan. Node should be also be skipped. assertTrue(e.skipTo(2)); assertEquals(2, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(5,3,6,3), e.node()); assertFalse(e.nextNode()); assertFalse(e.nextDocument()); } @Test public void testSkipNode() throws IOException { final MockSirenDocument[] docs = new MockSirenDocument[2048]; for (int i = 0; i < 2048; i += 4) { docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))); docs[i + 1] = doc(token("aaa", node(1,0)), token("bbb", node(1,0,1,0))); docs[i + 2] = doc(token("aaa", node(5,3,6,3)), token("bbb", node(5,3,6,3,7))); docs[i + 3] = doc(token("bbb", node(2,0)), token("aaa", node(5,3,6))); } this.addDocuments(docs); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); // first skip in skiplist is at 512 assertTrue(e.skipTo(502)); assertEquals(502, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(5,3,6,3), e.node()); assertFalse(e.nextNode()); // skip to 504 and scan partially nodes assertTrue(e.nextDocument()); assertTrue(e.nextDocument()); assertEquals(504, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1), e.node()); // must have used the second skip assertTrue(e.skipTo(1624)); assertEquals(1624, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1), e.node()); assertTrue(e.nextNode()); assertEquals(node(2), e.node()); assertFalse(e.nextNode()); // no other skip, must have used the linear scan assertTrue(e.skipTo(2000)); assertEquals(2000, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1), e.node()); assertTrue(e.nextNode()); assertEquals(node(2), e.node()); assertFalse(e.nextNode()); assertFalse(e.skipTo(256323)); } @Test public void testSimpleNextPosition() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))), doc(token("bbb", node(1,0)), token("bbb", node(1,0,1,0))), doc(token("bbb", node(5,3,6)), token("aaa", node(5,3,6,3)), token("aaa", node(5,3,6,3))) ); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); assertEquals(-1, e.doc()); assertEquals(0, e.nodeFreqInDoc()); assertEquals(node(-1), e.node()); assertEquals(-1, e.pos()); assertTrue(e.nextDocument()); assertEquals(0, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1), e.node()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertFalse(e.nextPosition()); assertTrue(e.nextNode()); assertEquals(node(2), e.node()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertFalse(e.nextPosition()); assertFalse(e.nextNode()); assertTrue(e.nextDocument()); assertEquals(2, e.doc()); assertEquals(1, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(5,3,6,3), e.node()); assertEquals(2, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertTrue(e.nextPosition()); assertEquals(1, e.pos()); assertFalse(e.nextPosition()); assertFalse(e.nextNode()); assertFalse(e.nextDocument()); } @Test public void testSimpleFrequencies() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))), doc(token("aaa", node(1)), token("aaa", node(1)), token("aaa", node(2))) ); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); assertEquals(-1, e.doc()); // freqs should be set to 0 at the beginning assertEquals(0, e.nodeFreqInDoc()); assertEquals(0, e.termFreqInNode()); // nodeFreqInDoc should be set after calling nextDocument assertTrue(e.nextDocument()); assertEquals(2, e.nodeFreqInDoc()); // termFreqInNode should be set to 0 assertEquals(0, e.termFreqInNode()); // calling termFreqInNode should not change the freq settings assertEquals(2, e.nodeFreqInDoc()); // termFreqInNode should be set after calling nextNode assertTrue(e.nextNode()); // nodeFreqInDoc and nodeFreqInDoc should not have changed of settings assertEquals(2, e.nodeFreqInDoc()); // termFreqInNode should be set to 1 assertEquals(1, e.termFreqInNode()); // calling termFreqInNode should not change the freqs settings assertEquals(2, e.nodeFreqInDoc()); // calling nextPosition should not change freqs settings assertTrue(e.nextPosition()); assertEquals(2, e.nodeFreqInDoc()); assertEquals(1, e.termFreqInNode()); // partially scanned position should not have consequences on nodeFreqInDoc // settings assertTrue(e.nextDocument()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(2, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(2, e.termFreqInNode()); assertTrue(e.nextNode()); assertEquals(1, e.termFreqInNode()); } @Test public void testSimpleMerge() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))) ); this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))) ); this.forceMerge(); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); assertTrue(e.nextDocument()); assertEquals(0, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertTrue(e.nextNode()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertTrue(e.nextDocument()); assertEquals(1, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertTrue(e.nextNode()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); } @Test public void testMergeBlockSize() throws IOException { // reduce block size this.setPostingsFormat(new Siren10VIntPostingsFormat(2)); this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0))), doc(token("aaa", node(1)), token("bbb", node(1,0))), doc(token("aaa", node(1)), token("bbb", node(1,0))) ); this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0))), doc(token("aaa", node(1)), token("bbb", node(1,0))) ); this.forceMerge(); } @Test public void testStressMerge() throws IOException { this.addDocuments( doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))) ); while (this.reader.numDocs() < 10000) { final int batchSize = LuceneTestCase.random().nextInt(20); final MockSirenDocument[] docs = new MockSirenDocument[batchSize]; for (int i = 0; i < batchSize; i++) { docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))); } this.addDocuments(docs); this.forceMerge(); } final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); for (int i = 0; i < reader.numDocs(); i++) { assertTrue(e.nextDocument()); assertEquals(i, e.doc()); assertEquals(2, e.nodeFreqInDoc()); assertTrue(e.nextNode()); assertEquals(node(1), e.node()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertTrue(e.nextNode()); assertEquals(node(2), e.node()); assertEquals(1, e.termFreqInNode()); assertTrue(e.nextPosition()); assertEquals(0, e.pos()); assertFalse(e.nextNode()); } } @Test public void testSkipDataCheckIndex() throws IOException { // The Lucene CheckIndex was catching a problem with how skip data level // were computed on this configuration. this.setPostingsFormat(new Siren10VIntPostingsFormat(256)); final MockSirenDocument[] docs = new MockSirenDocument[1000]; for (int i = 0; i < 1000; i++) { docs[i] = doc(token("aaa", node(1)), token("bbb", node(1,0)), token("aaa", node(2))); } this.addDocuments(docs); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); } @Test public void testDeltaNode() throws IOException { final MockSirenDocument[] docs = new MockSirenDocument[2048]; for (int i = 0; i < 2048; i += 2) { docs[i] = doc(token("aaa", node(1,1)), token("aaa", node(2,1)), token("aaa", node(2,5))); docs[i + 1] = doc(token("aaa", node(5,3,1)), token("aaa", node(5,3,6,3)), token("aaa", node(5,3,6,5)), token("aaa", node(6))); } this.addDocuments(docs); final AtomicReader aReader = SlowCompositeReaderWrapper.wrap(reader); final DocsEnum docsEnum = aReader.termDocsEnum(new Term(DEFAULT_TEST_FIELD, new BytesRef("aaa"))); assertTrue(docsEnum instanceof Siren10DocsEnum); final Siren10DocsNodesAndPositionsEnum e = ((Siren10DocsEnum) docsEnum).getDocsNodesAndPositionsEnum(); for (int i = 0; i < 2048; i += 2) { assertTrue(e.nextDocument()); assertTrue(e.nextNode()); assertEquals(node(1,1), e.node()); assertTrue(e.nextNode()); assertEquals(node(2,1), e.node()); assertTrue(e.nextNode()); assertEquals(node(2,5), e.node()); assertTrue(e.nextDocument()); assertTrue(e.nextNode()); assertEquals(node(5,3,1), e.node()); assertTrue(e.nextNode()); assertEquals(node(5,3,6,3), e.node()); assertTrue(e.nextNode()); assertEquals(node(5,3,6,5), e.node()); assertTrue(e.nextNode()); assertEquals(node(6), e.node()); } } }