package org.apache.lucene.index.sorter; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FilterAtomicReader; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Sort; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMFile; import org.apache.lucene.store.RAMInputStream; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TimSorter; import org.apache.lucene.util.automaton.CompiledAutomaton; /** * An {@link AtomicReader} which supports sorting documents by a given * {@link Sort}. You can use this class to sort an index as follows: * * <pre class="prettyprint"> * IndexWriter writer; // writer to which the sorted index will be added * DirectoryReader reader; // reader on the input index * Sort sort; // determines how the documents are sorted * AtomicReader sortingReader = SortingAtomicReader.wrap(SlowCompositeReaderWrapper.wrap(reader), sort); * writer.addIndexes(reader); * writer.close(); * reader.close(); * </pre> * * @lucene.experimental */ public class SortingAtomicReader extends FilterAtomicReader { private static class SortingFields extends FilterFields { private final Sorter.DocMap docMap; private final FieldInfos infos; public SortingFields(final Fields in, FieldInfos infos, Sorter.DocMap docMap) { super(in); this.docMap = docMap; this.infos = infos; } @Override public Terms terms(final String field) throws IOException { Terms terms = in.terms(field); if (terms == null) { return null; } else { return new SortingTerms(terms, infos.fieldInfo(field).getIndexOptions(), docMap); } } } private static class SortingTerms extends FilterTerms { private final Sorter.DocMap docMap; private final IndexOptions indexOptions; public SortingTerms(final Terms in, IndexOptions indexOptions, final Sorter.DocMap docMap) { super(in); this.docMap = docMap; this.indexOptions = indexOptions; } @Override public TermsEnum iterator(final TermsEnum reuse) throws IOException { return new SortingTermsEnum(in.iterator(reuse), docMap, indexOptions); } @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { return new SortingTermsEnum(in.intersect(compiled, startTerm), docMap, indexOptions); } } private static class SortingTermsEnum extends FilterTermsEnum { final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods private final IndexOptions indexOptions; public SortingTermsEnum(final TermsEnum in, Sorter.DocMap docMap, IndexOptions indexOptions) { super(in); this.docMap = docMap; this.indexOptions = indexOptions; } Bits newToOld(final Bits liveDocs) { if (liveDocs == null) { return null; } return new Bits() { @Override public boolean get(int index) { return liveDocs.get(docMap.oldToNew(index)); } @Override public int length() { return liveDocs.length(); } }; } @Override public DocsEnum docs(Bits liveDocs, DocsEnum reuse, final int flags) throws IOException { final DocsEnum inReuse; final SortingDocsEnum wrapReuse; if (reuse != null && reuse instanceof SortingDocsEnum) { // if we're asked to reuse the given DocsEnum and it is Sorting, return // the wrapped one, since some Codecs expect it. wrapReuse = (SortingDocsEnum) reuse; inReuse = wrapReuse.getWrapped(); } else { wrapReuse = null; inReuse = reuse; } final DocsEnum inDocs = in.docs(newToOld(liveDocs), inReuse, flags); final boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >=0 && (flags & DocsEnum.FLAG_FREQS) != 0; return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, withFreqs, docMap); } @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, final int flags) throws IOException { final DocsAndPositionsEnum inReuse; final SortingDocsAndPositionsEnum wrapReuse; if (reuse != null && reuse instanceof SortingDocsAndPositionsEnum) { // if we're asked to reuse the given DocsEnum and it is Sorting, return // the wrapped one, since some Codecs expect it. wrapReuse = (SortingDocsAndPositionsEnum) reuse; inReuse = wrapReuse.getWrapped(); } else { wrapReuse = null; inReuse = reuse; } final DocsAndPositionsEnum inDocsAndPositions = in.docsAndPositions(newToOld(liveDocs), inReuse, flags); if (inDocsAndPositions == null) { return null; } // we ignore the fact that offsets may be stored but not asked for, // since this code is expected to be used during addIndexes which will // ask for everything. if that assumption changes in the future, we can // factor in whether 'flags' says offsets are not required. final boolean storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; return new SortingDocsAndPositionsEnum(docMap.size(), wrapReuse, inDocsAndPositions, docMap, storeOffsets); } } private static class SortingBinaryDocValues extends BinaryDocValues { private final BinaryDocValues in; private final Sorter.DocMap docMap; SortingBinaryDocValues(BinaryDocValues in, Sorter.DocMap docMap) { this.in = in; this.docMap = docMap; } @Override public void get(int docID, BytesRef result) { in.get(docMap.newToOld(docID), result); } } private static class SortingNumericDocValues extends NumericDocValues { private final NumericDocValues in; private final Sorter.DocMap docMap; public SortingNumericDocValues(final NumericDocValues in, Sorter.DocMap docMap) { this.in = in; this.docMap = docMap; } @Override public long get(int docID) { return in.get(docMap.newToOld(docID)); } } private static class SortingBits implements Bits { private final Bits in; private final Sorter.DocMap docMap; public SortingBits(final Bits in, Sorter.DocMap docMap) { this.in = in; this.docMap = docMap; } @Override public boolean get(int index) { return in.get(docMap.newToOld(index)); } @Override public int length() { return in.length(); } } private static class SortingSortedDocValues extends SortedDocValues { private final SortedDocValues in; private final Sorter.DocMap docMap; SortingSortedDocValues(SortedDocValues in, Sorter.DocMap docMap) { this.in = in; this.docMap = docMap; } @Override public int getOrd(int docID) { return in.getOrd(docMap.newToOld(docID)); } @Override public void lookupOrd(int ord, BytesRef result) { in.lookupOrd(ord, result); } @Override public int getValueCount() { return in.getValueCount(); } @Override public void get(int docID, BytesRef result) { in.get(docMap.newToOld(docID), result); } @Override public int lookupTerm(BytesRef key) { return in.lookupTerm(key); } } private static class SortingSortedSetDocValues extends SortedSetDocValues { private final SortedSetDocValues in; private final Sorter.DocMap docMap; SortingSortedSetDocValues(SortedSetDocValues in, Sorter.DocMap docMap) { this.in = in; this.docMap = docMap; } @Override public long nextOrd() { return in.nextOrd(); } @Override public void setDocument(int docID) { in.setDocument(docMap.newToOld(docID)); } @Override public void lookupOrd(long ord, BytesRef result) { in.lookupOrd(ord, result); } @Override public long getValueCount() { return in.getValueCount(); } @Override public long lookupTerm(BytesRef key) { return in.lookupTerm(key); } } static class SortingDocsEnum extends FilterDocsEnum { private static final class DocFreqSorter extends TimSorter { private int[] docs; private int[] freqs; private final int[] tmpDocs; private int[] tmpFreqs; public DocFreqSorter(int maxDoc) { super(maxDoc / 64); this.tmpDocs = new int[maxDoc / 64]; } public void reset(int[] docs, int[] freqs) { this.docs = docs; this.freqs = freqs; if (freqs != null && tmpFreqs == null) { tmpFreqs = new int[tmpDocs.length]; } } @Override protected int compare(int i, int j) { return docs[i] - docs[j]; } @Override protected void swap(int i, int j) { int tmpDoc = docs[i]; docs[i] = docs[j]; docs[j] = tmpDoc; if (freqs != null) { int tmpFreq = freqs[i]; freqs[i] = freqs[j]; freqs[j] = tmpFreq; } } @Override protected void copy(int src, int dest) { docs[dest] = docs[src]; if (freqs != null) { freqs[dest] = freqs[src]; } } @Override protected void save(int i, int len) { System.arraycopy(docs, i, tmpDocs, 0, len); if (freqs != null) { System.arraycopy(freqs, i, tmpFreqs, 0, len); } } @Override protected void restore(int i, int j) { docs[j] = tmpDocs[i]; if (freqs != null) { freqs[j] = tmpFreqs[i]; } } @Override protected int compareSaved(int i, int j) { return tmpDocs[i] - docs[j]; } } private final int maxDoc; private final DocFreqSorter sorter; private int[] docs; private int[] freqs; private int docIt = -1; private final int upto; private final boolean withFreqs; SortingDocsEnum(int maxDoc, SortingDocsEnum reuse, final DocsEnum in, boolean withFreqs, final Sorter.DocMap docMap) throws IOException { super(in); this.maxDoc = maxDoc; this.withFreqs = withFreqs; if (reuse != null) { if (reuse.maxDoc == maxDoc) { sorter = reuse.sorter; } else { sorter = new DocFreqSorter(maxDoc); } docs = reuse.docs; freqs = reuse.freqs; // maybe null } else { docs = new int[64]; sorter = new DocFreqSorter(maxDoc); } docIt = -1; int i = 0; int doc; if (withFreqs) { if (freqs == null || freqs.length < docs.length) { freqs = new int[docs.length]; } while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){ if (i >= docs.length) { docs = ArrayUtil.grow(docs, docs.length + 1); freqs = ArrayUtil.grow(freqs, freqs.length + 1); } docs[i] = docMap.oldToNew(doc); freqs[i] = in.freq(); ++i; } } else { freqs = null; while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){ if (i >= docs.length) { docs = ArrayUtil.grow(docs, docs.length + 1); } docs[i++] = docMap.oldToNew(doc); } } // TimSort can save much time compared to other sorts in case of // reverse sorting, or when sorting a concatenation of sorted readers sorter.reset(docs, freqs); sorter.sort(0, i); upto = i; } // for testing boolean reused(DocsEnum other) { if (other == null || !(other instanceof SortingDocsEnum)) { return false; } return docs == ((SortingDocsEnum) other).docs; } @Override public int advance(final int target) throws IOException { // need to support it for checkIndex, but in practice it won't be called, so // don't bother to implement efficiently for now. return slowAdvance(target); } @Override public int docID() { return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt]; } @Override public int freq() throws IOException { return withFreqs && docIt < upto ? freqs[docIt] : 1; } @Override public int nextDoc() throws IOException { if (++docIt >= upto) return NO_MORE_DOCS; return docs[docIt]; } /** Returns the wrapped {@link DocsEnum}. */ DocsEnum getWrapped() { return in; } } static class SortingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum { /** * A {@link TimSorter} which sorts two parallel arrays of doc IDs and * offsets in one go. Everytime a doc ID is 'swapped', its correponding offset * is swapped too. */ private static final class DocOffsetSorter extends TimSorter { private int[] docs; private long[] offsets; private final int[] tmpDocs; private final long[] tmpOffsets; public DocOffsetSorter(int maxDoc) { super(maxDoc / 64); this.tmpDocs = new int[maxDoc / 64]; this.tmpOffsets = new long[maxDoc / 64]; } public void reset(int[] docs, long[] offsets) { this.docs = docs; this.offsets = offsets; } @Override protected int compare(int i, int j) { return docs[i] - docs[j]; } @Override protected void swap(int i, int j) { int tmpDoc = docs[i]; docs[i] = docs[j]; docs[j] = tmpDoc; long tmpOffset = offsets[i]; offsets[i] = offsets[j]; offsets[j] = tmpOffset; } @Override protected void copy(int src, int dest) { docs[dest] = docs[src]; offsets[dest] = offsets[src]; } @Override protected void save(int i, int len) { System.arraycopy(docs, i, tmpDocs, 0, len); System.arraycopy(offsets, i, tmpOffsets, 0, len); } @Override protected void restore(int i, int j) { docs[j] = tmpDocs[i]; offsets[j] = tmpOffsets[i]; } @Override protected int compareSaved(int i, int j) { return tmpDocs[i] - docs[j]; } } private final int maxDoc; private final DocOffsetSorter sorter; private int[] docs; private long[] offsets; private final int upto; private final IndexInput postingInput; private final boolean storeOffsets; private int docIt = -1; private int pos; private int startOffset = -1; private int endOffset = -1; private final BytesRef payload; private int currFreq; private final RAMFile file; SortingDocsAndPositionsEnum(int maxDoc, SortingDocsAndPositionsEnum reuse, final DocsAndPositionsEnum in, Sorter.DocMap docMap, boolean storeOffsets) throws IOException { super(in); this.maxDoc = maxDoc; this.storeOffsets = storeOffsets; if (reuse != null) { docs = reuse.docs; offsets = reuse.offsets; payload = reuse.payload; file = reuse.file; if (reuse.maxDoc == maxDoc) { sorter = reuse.sorter; } else { sorter = new DocOffsetSorter(maxDoc); } } else { docs = new int[32]; offsets = new long[32]; payload = new BytesRef(32); file = new RAMFile(); sorter = new DocOffsetSorter(maxDoc); } final IndexOutput out = new RAMOutputStream(file); int doc; int i = 0; while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (i == docs.length) { final int newLength = ArrayUtil.oversize(i + 1, 4); docs = Arrays.copyOf(docs, newLength); offsets = Arrays.copyOf(offsets, newLength); } docs[i] = docMap.oldToNew(doc); offsets[i] = out.getFilePointer(); addPositions(in, out); i++; } upto = i; sorter.reset(docs, offsets); sorter.sort(0, upto); out.close(); this.postingInput = new RAMInputStream("", file); } // for testing boolean reused(DocsAndPositionsEnum other) { if (other == null || !(other instanceof SortingDocsAndPositionsEnum)) { return false; } return docs == ((SortingDocsAndPositionsEnum) other).docs; } private void addPositions(final DocsAndPositionsEnum in, final IndexOutput out) throws IOException { int freq = in.freq(); out.writeVInt(freq); int previousPosition = 0; int previousEndOffset = 0; for (int i = 0; i < freq; i++) { final int pos = in.nextPosition(); final BytesRef payload = in.getPayload(); // The low-order bit of token is set only if there is a payload, the // previous bits are the delta-encoded position. final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1); out.writeVInt(token); previousPosition = pos; if (storeOffsets) { // don't encode offsets if they are not stored final int startOffset = in.startOffset(); final int endOffset = in.endOffset(); out.writeVInt(startOffset - previousEndOffset); out.writeVInt(endOffset - startOffset); previousEndOffset = endOffset; } if (payload != null) { out.writeVInt(payload.length); out.writeBytes(payload.bytes, payload.offset, payload.length); } } } @Override public int advance(final int target) throws IOException { // need to support it for checkIndex, but in practice it won't be called, so // don't bother to implement efficiently for now. return slowAdvance(target); } @Override public int docID() { return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt]; } @Override public int endOffset() throws IOException { return endOffset; } @Override public int freq() throws IOException { return currFreq; } @Override public BytesRef getPayload() throws IOException { return payload.length == 0 ? null : payload; } @Override public int nextDoc() throws IOException { if (++docIt >= upto) return DocIdSetIterator.NO_MORE_DOCS; postingInput.seek(offsets[docIt]); currFreq = postingInput.readVInt(); // reset variables used in nextPosition pos = 0; endOffset = 0; return docs[docIt]; } @Override public int nextPosition() throws IOException { final int token = postingInput.readVInt(); pos += token >>> 1; if (storeOffsets) { startOffset = endOffset + postingInput.readVInt(); endOffset = startOffset + postingInput.readVInt(); } if ((token & 1) != 0) { payload.offset = 0; payload.length = postingInput.readVInt(); if (payload.length > payload.bytes.length) { payload.bytes = new byte[ArrayUtil.oversize(payload.length, 1)]; } postingInput.readBytes(payload.bytes, 0, payload.length); } else { payload.length = 0; } return pos; } @Override public int startOffset() throws IOException { return startOffset; } /** Returns the wrapped {@link DocsAndPositionsEnum}. */ DocsAndPositionsEnum getWrapped() { return in; } } /** Return a sorted view of <code>reader</code> according to the order * defined by <code>sort</code>. If the reader is already sorted, this * method might return the reader as-is. */ public static AtomicReader wrap(AtomicReader reader, Sort sort) throws IOException { return wrap(reader, new Sorter(sort).sort(reader)); } /** Expert: same as {@link #wrap(AtomicReader, Sort)} but operates directly on a {@link Sorter.DocMap}. */ static AtomicReader wrap(AtomicReader reader, Sorter.DocMap docMap) { if (docMap == null) { // the reader is already sorter return reader; } if (reader.maxDoc() != docMap.size()) { throw new IllegalArgumentException("reader.maxDoc() should be equal to docMap.size(), got" + reader.maxDoc() + " != " + docMap.size()); } assert Sorter.isConsistent(docMap); return new SortingAtomicReader(reader, docMap); } final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods private SortingAtomicReader(final AtomicReader in, final Sorter.DocMap docMap) { super(in); this.docMap = docMap; } @Override public void document(final int docID, final StoredFieldVisitor visitor) throws IOException { in.document(docMap.newToOld(docID), visitor); } @Override public Fields fields() throws IOException { Fields fields = in.fields(); if (fields == null) { return null; } else { return new SortingFields(fields, in.getFieldInfos(), docMap); } } @Override public BinaryDocValues getBinaryDocValues(String field) throws IOException { BinaryDocValues oldDocValues = in.getBinaryDocValues(field); if (oldDocValues == null) { return null; } else { return new SortingBinaryDocValues(oldDocValues, docMap); } } @Override public Bits getLiveDocs() { final Bits inLiveDocs = in.getLiveDocs(); if (inLiveDocs == null) { return null; } else { return new SortingBits(inLiveDocs, docMap); } } @Override public NumericDocValues getNormValues(String field) throws IOException { final NumericDocValues norm = in.getNormValues(field); if (norm == null) { return null; } else { return new SortingNumericDocValues(norm, docMap); } } @Override public NumericDocValues getNumericDocValues(String field) throws IOException { final NumericDocValues oldDocValues = in.getNumericDocValues(field); if (oldDocValues == null) return null; return new SortingNumericDocValues(oldDocValues, docMap); } @Override public SortedDocValues getSortedDocValues(String field) throws IOException { SortedDocValues sortedDV = in.getSortedDocValues(field); if (sortedDV == null) { return null; } else { return new SortingSortedDocValues(sortedDV, docMap); } } @Override public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { SortedSetDocValues sortedSetDV = in.getSortedSetDocValues(field); if (sortedSetDV == null) { return null; } else { return new SortingSortedSetDocValues(sortedSetDV, docMap); } } @Override public Bits getDocsWithField(String field) throws IOException { Bits bits = in.getDocsWithField(field); if (bits == null || bits instanceof Bits.MatchAllBits || bits instanceof Bits.MatchNoBits) { return bits; } else { return new SortingBits(bits, docMap); } } @Override public Fields getTermVectors(final int docID) throws IOException { return in.getTermVectors(docMap.newToOld(docID)); } }