package org.apache.lucene.index.sorter; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.index.sorter.SortingAtomicReader.SortingDocsAndPositionsEnum; import org.apache.lucene.index.sorter.SortingAtomicReader.SortingDocsEnum; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; public abstract class SorterTestBase extends LuceneTestCase { static final class NormsSimilarity extends Similarity { private final Similarity in; public NormsSimilarity(Similarity in) { this.in = in; } @Override public long computeNorm(FieldInvertState state) { if (state.getName().equals(NORMS_FIELD)) { return Float.floatToIntBits(state.getBoost()); } else { return in.computeNorm(state); } } @Override public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) { return in.computeWeight(queryBoost, collectionStats, termStats); } @Override public SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException { return in.simScorer(weight, context); } } static final class PositionsTokenStream extends TokenStream { private final CharTermAttribute term; private final PayloadAttribute payload; private final OffsetAttribute offset; private int pos, off; public PositionsTokenStream() { term = addAttribute(CharTermAttribute.class); payload = addAttribute(PayloadAttribute.class); offset = addAttribute(OffsetAttribute.class); } @Override public boolean incrementToken() throws IOException { if (pos == 0) { return false; } clearAttributes(); term.append(DOC_POSITIONS_TERM); payload.setPayload(new BytesRef(Integer.toString(pos))); offset.setOffset(off, off); --pos; ++off; return true; } void setId(int id) { pos = id / 10 + 1; off = 0; } } protected static final String ID_FIELD = "id"; protected static final String DOCS_ENUM_FIELD = "docs"; protected static final String DOCS_ENUM_TERM = "$all$"; protected static final String DOC_POSITIONS_FIELD = "positions"; protected static final String DOC_POSITIONS_TERM = "$all$"; protected static final String NUMERIC_DV_FIELD = "numeric"; protected static final String NORMS_FIELD = "norm"; protected static final String BINARY_DV_FIELD = "binary"; protected static final String SORTED_DV_FIELD = "sorted"; protected static final String SORTED_SET_DV_FIELD = "sorted_set"; protected static final String TERM_VECTORS_FIELD = "term_vectors"; private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); static { TERM_VECTORS_TYPE.setStoreTermVectors(true); TERM_VECTORS_TYPE.freeze(); } private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); static { POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); POSITIONS_TYPE.freeze(); } protected static Directory dir; protected static AtomicReader reader; protected static Integer[] sortedValues; private static Document doc(final int id, PositionsTokenStream positions) { final Document doc = new Document(); doc.add(new StringField(ID_FIELD, Integer.toString(id), Store.YES)); doc.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO)); positions.setId(id); if (doesntSupportOffsets.contains(TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) { // codec doesnt support offsets: just index positions for the field doc.add(new Field(DOC_POSITIONS_FIELD, positions, TextField.TYPE_NOT_STORED)); } else { doc.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE)); } doc.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id)); TextField norms = new TextField(NORMS_FIELD, Integer.toString(id), Store.NO); norms.setBoost(Float.intBitsToFloat(id)); doc.add(norms); doc.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(Integer.toString(id)))); doc.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(Integer.toString(id)))); if (defaultCodecSupportsSortedSet()) { doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id)))); doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1)))); } doc.add(new Field(TERM_VECTORS_FIELD, Integer.toString(id), TERM_VECTORS_TYPE)); return doc; } /** Creates an index for sorting. */ public static void createIndex(Directory dir, int numDocs, Random random) throws IOException { List<Integer> ids = new ArrayList<>(); for (int i = 0; i < numDocs; i++) { ids.add(Integer.valueOf(i * 10)); } // shuffle them for indexing Collections.shuffle(ids, random); if (VERBOSE) { System.out.println("Shuffled IDs for indexing: " + Arrays.toString(ids.toArray())); } PositionsTokenStream positions = new PositionsTokenStream(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); conf.setMaxBufferedDocs(4); // create some segments conf.setSimilarity(new NormsSimilarity(conf.getSimilarity())); // for testing norms field RandomIndexWriter writer = new RandomIndexWriter(random, dir, conf); writer.setDoRandomForceMerge(false); for (int id : ids) { writer.addDocument(doc(id, positions)); } // delete some documents writer.commit(); for (Integer id : ids) { if (random.nextDouble() < 0.2) { if (VERBOSE) { System.out.println("delete doc_id " + id); } writer.deleteDocuments(new Term(ID_FIELD, id.toString())); } } writer.close(); } @BeforeClass public static void beforeClassSorterTestBase() throws Exception { dir = newDirectory(); int numDocs = atLeast(20); createIndex(dir, numDocs, random()); reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir)); } @AfterClass public static void afterClassSorterTestBase() throws Exception { reader.close(); dir.close(); } @Test public void testBinaryDocValuesField() throws Exception { BinaryDocValues dv = reader.getBinaryDocValues(BINARY_DV_FIELD); BytesRef bytes = new BytesRef(); for (int i = 0; i < reader.maxDoc(); i++) { dv.get(i, bytes); assertEquals("incorrect binary DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString()); } } @Test public void testDocsAndPositionsEnum() throws Exception { TermsEnum termsEnum = reader.terms(DOC_POSITIONS_FIELD).iterator(null); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOC_POSITIONS_TERM))); DocsAndPositionsEnum sortedPositions = termsEnum.docsAndPositions(null, null); int doc; // test nextDoc() while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { int freq = sortedPositions.freq(); assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq); for (int i = 0; i < freq; i++) { assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition()); if (!doesntSupportOffsets.contains(TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) { assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset()); assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset()); } assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString())); } } // test advance() final DocsAndPositionsEnum reuse = sortedPositions; sortedPositions = termsEnum.docsAndPositions(null, reuse); if (sortedPositions instanceof SortingDocsAndPositionsEnum) { assertTrue(((SortingDocsAndPositionsEnum) sortedPositions).reused(reuse)); // make sure reuse worked } doc = 0; while ((doc = sortedPositions.advance(doc + TestUtil.nextInt(random(), 1, 5))) != DocIdSetIterator.NO_MORE_DOCS) { int freq = sortedPositions.freq(); assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq); for (int i = 0; i < freq; i++) { assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition()); if (!doesntSupportOffsets.contains(TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) { assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset()); assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset()); } assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString())); } } } Bits randomLiveDocs(int maxDoc) { if (rarely()) { if (random().nextBoolean()) { return null; } else { return new Bits.MatchNoBits(maxDoc); } } final FixedBitSet bits = new FixedBitSet(maxDoc); final int bitsSet = TestUtil.nextInt(random(), 1, maxDoc - 1); for (int i = 0; i < bitsSet; ++i) { while (true) { final int index = random().nextInt(maxDoc); if (!bits.get(index)) { bits.set(index); break; } } } return bits; } @Test public void testDocsEnum() throws Exception { Bits mappedLiveDocs = randomLiveDocs(reader.maxDoc()); TermsEnum termsEnum = reader.terms(DOCS_ENUM_FIELD).iterator(null); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOCS_ENUM_TERM))); DocsEnum docs = termsEnum.docs(mappedLiveDocs, null); int doc; int prev = -1; while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(doc)); assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD))); while (++prev < doc) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } } while (++prev < reader.maxDoc()) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } DocsEnum reuse = docs; docs = termsEnum.docs(mappedLiveDocs, reuse); if (docs instanceof SortingDocsEnum) { assertTrue(((SortingDocsEnum) docs).reused(reuse)); // make sure reuse worked } doc = -1; prev = -1; while ((doc = docs.advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS) { assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(doc)); assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD))); while (++prev < doc) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } } while (++prev < reader.maxDoc()) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } } @Test public void testNormValues() throws Exception { NumericDocValues dv = reader.getNormValues(NORMS_FIELD); int maxDoc = reader.maxDoc(); for (int i = 0; i < maxDoc; i++) { assertEquals("incorrect norm value for doc " + i, sortedValues[i].intValue(), dv.get(i)); } } @Test public void testNumericDocValuesField() throws Exception { NumericDocValues dv = reader.getNumericDocValues(NUMERIC_DV_FIELD); int maxDoc = reader.maxDoc(); for (int i = 0; i < maxDoc; i++) { assertEquals("incorrect numeric DocValues for doc " + i, sortedValues[i].intValue(), dv.get(i)); } } @Test public void testSortedDocValuesField() throws Exception { SortedDocValues dv = reader.getSortedDocValues(SORTED_DV_FIELD); int maxDoc = reader.maxDoc(); BytesRef bytes = new BytesRef(); for (int i = 0; i < maxDoc; i++) { dv.get(i, bytes); assertEquals("incorrect sorted DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString()); } } @Test public void testSortedSetDocValuesField() throws Exception { assumeTrue("default codec does not support SORTED_SET", defaultCodecSupportsSortedSet()); SortedSetDocValues dv = reader.getSortedSetDocValues(SORTED_SET_DV_FIELD); int maxDoc = reader.maxDoc(); BytesRef bytes = new BytesRef(); for (int i = 0; i < maxDoc; i++) { dv.setDocument(i); dv.lookupOrd(dv.nextOrd(), bytes); int value = sortedValues[i].intValue(); assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value).toString(), bytes.utf8ToString()); dv.lookupOrd(dv.nextOrd(), bytes); assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value + 1).toString(), bytes.utf8ToString()); assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd()); } } @Test public void testTermVectors() throws Exception { int maxDoc = reader.maxDoc(); for (int i = 0; i < maxDoc; i++) { Terms terms = reader.getTermVector(i, TERM_VECTORS_FIELD); assertNotNull("term vectors not found for doc " + i + " field [" + TERM_VECTORS_FIELD + "]", terms); assertEquals("incorrect term vector for doc " + i, sortedValues[i].toString(), terms.iterator(null).next().utf8ToString()); } } }