package org.apache.lucene.index.codecs.preflex;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Comparator;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

/** Exposes flex API on a pre-flex index, as a codec.
 * @lucene.experimental */
public class PreFlexFields extends FieldsProducer {

  private static final boolean DEBUG_SURROGATES = false;

  public TermInfosReader tis;
  public final TermInfosReader tisNoIndex;

  public final IndexInput freqStream;
  public final IndexInput proxStream;
  final private FieldInfos fieldInfos;
  private final SegmentInfo si;
  final TreeMap<String,FieldInfo> fields = new TreeMap<String,FieldInfo>();
  private final Directory dir;
  private final int readBufferSize;
  private Directory cfsReader;

  public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
    throws IOException {

    si = info;

    // NOTE: we must always load terms index, even for
    // "sequential" scan during merging, because what is
    // sequential to merger may not be to TermInfosReader
    // since we do the surrogates dance:
    if (indexDivisor < 0) {
      indexDivisor = -indexDivisor;
    }

    TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
    if (indexDivisor == -1) {
      tisNoIndex = r;
    } else {
      tisNoIndex = null;
      tis = r;
    }
    this.readBufferSize = readBufferSize;
    this.fieldInfos = fieldInfos;

    // make sure that all index files have been read or are kept open
    // so that if an index update removes them we'll still have them
    freqStream = dir.openInput(info.name + ".frq", readBufferSize);
    boolean anyProx = false;
    final int numFields = fieldInfos.size();
    for(int i=0;i<numFields;i++) {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(i);
      if (fieldInfo.isIndexed) {
        fields.put(fieldInfo.name, fieldInfo);
        if (!fieldInfo.omitTermFreqAndPositions) {
          anyProx = true;
        }
      }
    }

    if (anyProx) {
      proxStream = dir.openInput(info.name + ".prx", readBufferSize);
    } else {
      proxStream = null;
    }

    this.dir = dir;
  }
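
  // Example of the two sort orders: consider the single-code-point terms
  // U+FFFD (UTF8 bytes EF BF BD) and U+10000 (UTF8 bytes F0 90 80 80).  In
  // UTF16 code unit order U+10000 is the surrogate pair D800 DC00, and
  // 0xD800 < 0xFFFD, so U+10000 sorts first; in unicode code point (= UTF8
  // byte) order U+FFFD sorts first.  The pre-flex terms dict stores the
  // former order, while the flex API must expose the latter -- hence the
  // "surrogates dance" below.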

  // If this returns true, we do the surrogates dance so that the
  // terms are sorted by unicode sort order.  This should be
  // true when segments are used for "normal" searching;
  // it's only false during testing, to create a pre-flex
  // index, using the test-only PreFlexRW.
  protected boolean sortTermsByUnicode() {
    return true;
  }

  static void files(Directory dir, SegmentInfo info, Collection<String> files) throws IOException {
    files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION));
    files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
    files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.FREQ_EXTENSION));
    if (info.getHasProx()) {
      // LUCENE-1739: for certain versions of 2.9-dev,
      // hasProx would be incorrectly computed during
      // indexing as true, and then stored into the segments
      // file, when it should have been false.  So we do the
      // extra check, here:
      final String prx = IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.PROX_EXTENSION);
      if (dir.fileExists(prx)) {
        files.add(prx);
      }
    }
  }

  @Override
  public FieldsEnum iterator() throws IOException {
    return new PreFlexFieldsEnum();
  }

  @Override
  public Terms terms(String field) {
    FieldInfo fi = fieldInfos.fieldInfo(field);
    if (fi != null) {
      return new PreTerms(fi);
    } else {
      return null;
    }
  }

  synchronized private TermInfosReader getTermsDict() {
    if (tis != null) {
      return tis;
    } else {
      return tisNoIndex;
    }
  }

  @Override
  synchronized public void loadTermsIndex(int indexDivisor) throws IOException {
    if (tis == null) {
      Directory dir0;
      if (si.getUseCompoundFile()) {
        // In some cases, we were originally opened when CFS
        // was not used, but then we are asked to open the
        // terms reader with index, the segment has switched
        // to CFS
        if (!(dir instanceof CompoundFileReader)) {
          dir0 = cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(si.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize);
        } else {
          dir0 = dir;
        }
      } else {
        dir0 = dir;
      }

      tis = new TermInfosReader(dir0, si.name, fieldInfos, readBufferSize, indexDivisor);
    }
  }

  @Override
  public void close() throws IOException {
    if (tis != null) {
      tis.close();
    }
    if (tisNoIndex != null) {
      tisNoIndex.close();
    }
    if (cfsReader != null) {
      cfsReader.close();
    }
    if (freqStream != null) {
      freqStream.close();
    }
    if (proxStream != null) {
      proxStream.close();
    }
  }

  private class PreFlexFieldsEnum extends FieldsEnum {
    final Iterator<FieldInfo> it;
    private final PreTermsEnum termsEnum;
    FieldInfo current;

    public PreFlexFieldsEnum() throws IOException {
      it = fields.values().iterator();
      termsEnum = new PreTermsEnum();
    }

    @Override
    public String next() {
      if (it.hasNext()) {
        current = it.next();
        return current.name;
      } else {
        return null;
      }
    }

    @Override
    public TermsEnum terms() throws IOException {
      termsEnum.reset(current);
      return termsEnum;
    }
  }

  private class PreTerms extends Terms {
    final FieldInfo fieldInfo;

    PreTerms(FieldInfo fieldInfo) {
      this.fieldInfo = fieldInfo;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      PreTermsEnum termsEnum = new PreTermsEnum();
      termsEnum.reset(fieldInfo);
      return termsEnum;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // Pre-flex indexes always sorted in UTF16 order, but
      // we remap on-the-fly to unicode order
      if (sortTermsByUnicode()) {
        return BytesRef.getUTF8SortedAsUnicodeComparator();
      } else {
        return BytesRef.getUTF8SortedAsUTF16Comparator();
      }
    }
  }
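
  // PreTermsEnum below classifies each UTF8 lead byte to decide which
  // "category" the corresponding UTF16 character falls in: a lead byte with
  // all bits of 0xf0 set starts a 4-byte sequence, ie a non-BMP code point
  // (a surrogate pair in UTF16), eg F0 in F0 90 80 80 (U+10000); a lead byte
  // with all bits of 0xee set (0xee or 0xef) starts a 3-byte sequence in
  // U+E000..U+FFFF, ie a BMP char that sorts after the surrogates, eg EF in
  // EF BF BD (U+FFFD).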

  private class PreTermsEnum extends TermsEnum {
    private SegmentTermEnum termEnum;
    private FieldInfo fieldInfo;
    private boolean skipNext;
    private BytesRef current;
    private SegmentTermEnum seekTermEnum;
    private Term protoTerm;

    private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
    private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;

    // Returns true if the unicode char is "after" the
    // surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
    private final boolean isHighBMPChar(byte[] b, int idx) {
      return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
    }

    // Returns true if the unicode char in the UTF8 byte
    // sequence starting at idx encodes a char outside of
    // BMP (ie what would be a surrogate pair in UTF16):
    private final boolean isNonBMPChar(byte[] b, int idx) {
      return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
    }

    private final byte[] scratch = new byte[4];
    private final BytesRef prevTerm = new BytesRef();
    private final BytesRef scratchTerm = new BytesRef();
    private int newSuffixStart;

    // Swap in S, in place of E:
    private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
      final int savLength = term.length;

      assert term.offset == 0;

      // The 3 bytes starting at downTo make up 1
      // unicode character:
      assert isHighBMPChar(term.bytes, pos);

      // NOTE: we cannot make this assert, because
      // AutomatonQuery legitimately sends us malformed UTF8
      // (eg the UTF8 bytes with just 0xee)
      // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

      // Save the bytes && length, since we need to
      // restore this if seek "back" finds no matching
      // terms
      if (term.bytes.length < 4+pos) {
        term.grow(4+pos);
      }

      scratch[0] = term.bytes[pos];
      scratch[1] = term.bytes[pos+1];
      scratch[2] = term.bytes[pos+2];

      term.bytes[pos] = (byte) 0xf0;
      term.bytes[pos+1] = (byte) 0x90;
      term.bytes[pos+2] = (byte) 0x80;
      term.bytes[pos+3] = (byte) 0x80;
      term.length = 4+pos;

      if (DEBUG_SURROGATES) {
        System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
      }

      // Seek "back":
      getTermsDict().seekEnum(te, protoTerm.createTerm(term));

      // Test if the term we seek'd to in fact found a
      // surrogate pair at the same position as the E:
      Term t2 = te.term();

      // Cannot be null (or move to next field) because at
      // "worst" it'd seek to the same term we are on now,
      // unless we are being called from seek
      if (t2 == null || t2.field() != fieldInfo.name) {
        return false;
      }

      if (DEBUG_SURROGATES) {
        System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()));
      }

      // Now test if prefix is identical and we found
      // a non-BMP char at the same position:
      BytesRef b2 = t2.bytes();
      assert b2.offset == 0;

      boolean matches;
      if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
        matches = true;
        for(int i=0;i<pos;i++) {
          if (term.bytes[i] != b2.bytes[i]) {
            matches = false;
            break;
          }
        }
      } else {
        matches = false;
      }

      // Restore term:
      term.length = savLength;
      term.bytes[pos] = scratch[0];
      term.bytes[pos+1] = scratch[1];
      term.bytes[pos+2] = scratch[2];

      return matches;
    }

    // Seek type 2 "continue" (back to the start of the
    // surrogates): scan the stripped suffix from the
    // prior term, backwards.  If there was an E in that
    // part, then we try to seek back to S.  If that
    // seek finds a matching term, we go there.
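    //
    // Example: a field with the one-char terms A (U+0041), U+E000 and
    // U+10000 is stored as A, U+10000, U+E000 (UTF16 order) but must be
    // returned as A, U+E000, U+10000 (unicode order).  Once U+E000 has been
    // returned, the suffix stripped from that prior term contains an E at
    // position 0, so we seek back with the smallest S (U+10000, ie F0 90 80
    // 80) swapped in at that position and, if the seek lands on a matching
    // term, enumerate the pending S terms from there.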
    private boolean doContinue() throws IOException {

      if (DEBUG_SURROGATES) {
        System.out.println(" try cont");
      }

      int downTo = prevTerm.length-1;

      boolean didSeek = false;

      final int limit = Math.min(newSuffixStart, scratchTerm.length-1);

      while(downTo > limit) {

        if (isHighBMPChar(prevTerm.bytes, downTo)) {

          if (DEBUG_SURROGATES) {
            System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length);
          }

          if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
            // TODO: more efficient seek?
            getTermsDict().seekEnum(termEnum, seekTermEnum.term());
            //newSuffixStart = downTo+4;
            newSuffixStart = downTo;
            scratchTerm.copy(termEnum.term().bytes());
            didSeek = true;
            if (DEBUG_SURROGATES) {
              System.out.println(" seek!");
            }
            break;
          } else {
            if (DEBUG_SURROGATES) {
              System.out.println(" no seek");
            }
          }
        }

        // Shorten prevTerm in place so that we don't redo
        // this loop if we come back here:
        if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
          prevTerm.length = downTo;
        }

        downTo--;
      }

      return didSeek;
    }

    // Look for seek type 3 ("pop"): if the delta from
    // prev -> current was replacing an S with an E,
    // we must now seek to beyond that E.  This seek
    // "finishes" the dance at this character
    // position.
    private boolean doPop() throws IOException {

      if (DEBUG_SURROGATES) {
        System.out.println(" try pop");
      }

      assert newSuffixStart <= prevTerm.length;
      assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;

      if (prevTerm.length > newSuffixStart &&
          isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
          isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {

        // Seek type 3 -- put 0xFF at this position:
        scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
        scratchTerm.length = newSuffixStart+1;

        if (DEBUG_SURROGATES) {
          System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
        }

        // TODO: more efficient seek?  can we simply swap
        // the enums?
        getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));

        final Term t2 = termEnum.term();

        // We could hit EOF or different field since this
        // was a seek "forward":
        if (t2 != null && t2.field() == fieldInfo.name) {

          if (DEBUG_SURROGATES) {
            System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
          }

          final BytesRef b2 = t2.bytes();
          assert b2.offset == 0;

          // Set newSuffixStart -- we can't use
          // termEnum's since the above seek may have
          // done no scanning (eg, term was precisely
          // an index term, or, was in the term seek
          // cache):
          scratchTerm.copy(b2);
          setNewSuffixStart(prevTerm, scratchTerm);

          return true;
        } else if (newSuffixStart != 0 || scratchTerm.length != 0) {
          if (DEBUG_SURROGATES) {
            System.out.println(" got term=null (or next field)");
          }
          newSuffixStart = 0;
          scratchTerm.length = 0;
          return true;
        }
      }

      return false;
    }

    // Pre-flex indices store terms in UTF16 sort order, but
    // certain queries require Unicode codepoint order; this
    // method carefully seeks around surrogates to handle
    // this impedance mismatch
    private void surrogateDance() throws IOException {

      if (!unicodeSortOrder) {
        return;
      }

      // We are invoked after TIS.next() (by UTF16 order) to
      // possibly seek to a different "next" (by unicode
      // order) term.

      // We scan only the "delta" from the last term to the
      // current term, in UTF8 bytes.  We look at 1) the bytes
      // stripped from the prior term, and then 2) the bytes
      // appended to that prior term's prefix.

      // We don't care about specific UTF8 sequences, just
      // the "category" of the UTF16 character.
      // Category S is a high/low surrogate pair (ie
      // non-BMP).  Category E is any BMP char > UNI_SUR_LOW_END
      // (and < U+FFFF).  Category A is the rest (any unicode
      // char <= UNI_SUR_HIGH_START).

      // The core issue is that pre-flex indices sort the
      // characters as ASE, while flex must sort as AES.  So
      // when scanning, when we hit S, we must 1) seek
      // forward to E and enum the terms there, then 2) seek
      // back to S and enum all terms there, then 3) seek to
      // after E.  Three different seek points (1, 2, 3).

      // We can easily detect S in UTF8: if a byte has
      // prefix 11110 (0xf0), then that byte and the
      // following 3 bytes encode a single unicode codepoint
      // in S.  Similarly, we can detect E: if a byte has
      // prefix 1110111 (0xee), then that byte and the
      // following 2 bytes encode a single unicode codepoint
      // in E.

      // Note that this is really a recursive process --
      // maybe the char at pos 2 needs to dance, but at any
      // point in its dance, suddenly pos 4 needs to dance
      // so you must finish pos 4 before returning to pos
      // 2.  But then during pos 4's dance maybe pos 7 needs
      // to dance, etc.  However, despite being recursive,
      // we don't need to hold any state because the state
      // can always be derived by looking at prior term &
      // current term.

      // TODO: can we avoid this copy?
      if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
        scratchTerm.length = 0;
      } else {
        scratchTerm.copy(termEnum.term().bytes());
      }

      if (DEBUG_SURROGATES) {
        System.out.println(" dance");
        System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
        System.out.println(" " + prevTerm.toString());
        System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
        System.out.println(" " + scratchTerm.toString());
      }

      // This code assumes TermInfosReader/SegmentTermEnum
      // always use BytesRef.offset == 0
      assert prevTerm.offset == 0;
      assert scratchTerm.offset == 0;

      // Need to loop here because we may need to do multiple
      // pops, and possibly a continue in the end, ie:
      //
      //  cont
      //  pop, cont
      //  pop, pop, cont
      //  <nothing>
      //
      while(true) {
        if (doContinue()) {
          break;
        } else {
          if (!doPop()) {
            break;
          }
        }
      }

      if (DEBUG_SURROGATES) {
        System.out.println(" finish bmp ends");
      }

      doPushes();
    }

    // Look for seek type 1 ("push"): if the newly added
    // suffix contains any S, we must try to seek to the
    // corresponding E.  If we find a match, we go there;
    // else we keep looking for additional S's in the new
    // suffix.  This "starts" the dance, at this character
    // position:
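    //
    // Example: stepping from "xA" to "x" + U+10000 (78 F0 90 80 80), the
    // newly appended suffix holds an S at position 1.  We temporarily
    // overwrite those bytes with EE 80 80 (U+E000, the smallest E), seek
    // forward, and if a term such as "x" + U+E123 exists with the same
    // prefix we jump there and enumerate the E terms first; the S terms are
    // picked up again later via doContinue().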
    private void doPushes() throws IOException {

      int upTo = newSuffixStart;

      if (DEBUG_SURROGATES) {
        System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
      }

      while(upTo < scratchTerm.length) {
        if (isNonBMPChar(scratchTerm.bytes, upTo) &&
            (upTo > newSuffixStart ||
             (upTo >= prevTerm.length ||
              (!isNonBMPChar(prevTerm.bytes, upTo) &&
               !isHighBMPChar(prevTerm.bytes, upTo))))) {

          // A non-BMP char (4 bytes UTF8) starts here:
          assert scratchTerm.length >= upTo + 4;

          final int savLength = scratchTerm.length;
          scratch[0] = scratchTerm.bytes[upTo];
          scratch[1] = scratchTerm.bytes[upTo+1];
          scratch[2] = scratchTerm.bytes[upTo+2];

          scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
          scratchTerm.bytes[upTo+1] = (byte) 0x80;
          scratchTerm.bytes[upTo+2] = (byte) 0x80;
          scratchTerm.length = upTo+3;

          if (DEBUG_SURROGATES) {
            System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
          }

          // Seek "forward":
          // TODO: more efficient seek?
          getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));

          scratchTerm.bytes[upTo] = scratch[0];
          scratchTerm.bytes[upTo+1] = scratch[1];
          scratchTerm.bytes[upTo+2] = scratch[2];
          scratchTerm.length = savLength;

          // Did we find a match?
          final Term t2 = seekTermEnum.term();

          if (DEBUG_SURROGATES) {
            if (t2 == null) {
              System.out.println(" hit term=null");
            } else {
              System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
            }
          }

          // Since this was a seek "forward", we could hit
          // EOF or a different field:
          boolean matches;
          if (t2 != null && t2.field() == fieldInfo.name) {
            final BytesRef b2 = t2.bytes();
            assert b2.offset == 0;
            if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
              matches = true;
              for(int i=0;i<upTo;i++) {
                if (scratchTerm.bytes[i] != b2.bytes[i]) {
                  matches = false;
                  break;
                }
              }
            } else {
              matches = false;
            }
          } else {
            matches = false;
          }

          if (matches) {

            if (DEBUG_SURROGATES) {
              System.out.println(" matches!");
            }

            // OK seek "back"
            // TODO: more efficient seek?
            getTermsDict().seekEnum(termEnum, seekTermEnum.term());

            scratchTerm.copy(seekTermEnum.term().bytes());

            // +3 because we don't need to check the char
            // at upTo: we know it's > BMP
            upTo += 3;

            // NOTE: we keep iterating, now, since this
            // can easily "recurse".
            // Ie, after seeking
            // forward at a certain char position, we may
            // find another surrogate in our [new] suffix
            // and must then do another seek (recurse)
          } else {
            upTo++;
          }
        } else {
          upTo++;
        }
      }
    }

    private boolean unicodeSortOrder;

    void reset(FieldInfo fieldInfo) throws IOException {
      //System.out.println("pff.reset te=" + termEnum);
      this.fieldInfo = fieldInfo;
      protoTerm = new Term(fieldInfo.name);
      if (termEnum == null) {
        termEnum = getTermsDict().terms(protoTerm);
        seekTermEnum = getTermsDict().terms(protoTerm);
        //System.out.println(" term=" + termEnum.term());
      } else {
        getTermsDict().seekEnum(termEnum, protoTerm);
      }
      skipNext = true;

      unicodeSortOrder = sortTermsByUnicode();

      final Term t = termEnum.term();
      if (t != null && t.field() == fieldInfo.name) {
        newSuffixStart = 0;
        prevTerm.length = 0;
        surrogateDance();
      }
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // Pre-flex indexes always sorted in UTF16 order, but
      // we remap on-the-fly to unicode order
      if (unicodeSortOrder) {
        return BytesRef.getUTF8SortedAsUnicodeComparator();
      } else {
        return BytesRef.getUTF8SortedAsUTF16Comparator();
      }
    }

    @Override
    public SeekStatus seek(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public long ord() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
      if (DEBUG_SURROGATES) {
        System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
      }
      skipNext = false;
      final TermInfosReader tis = getTermsDict();
      final Term t0 = protoTerm.createTerm(term);

      assert termEnum != null;

      tis.seekEnum(termEnum, t0);

      final Term t = termEnum.term();

      if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
        // If we found an exact match, no need to do the
        // surrogate dance
        if (DEBUG_SURROGATES) {
          System.out.println(" seek exact match");
        }
        current = t.bytes();
        return SeekStatus.FOUND;
      } else if (t == null || t.field() != fieldInfo.name) {

        // TODO: maybe we can handle this like the next()
        // into null?  set term as prevTerm then dance?

        if (DEBUG_SURROGATES) {
          System.out.println(" seek hit EOF");
        }

        // We hit EOF; try end-case surrogate dance: if we
        // find an E, try swapping in S, backwards:
        scratchTerm.copy(term);

        assert scratchTerm.offset == 0;

        for(int i=scratchTerm.length-1;i>=0;i--) {
          if (isHighBMPChar(scratchTerm.bytes, i)) {
            if (DEBUG_SURROGATES) {
              System.out.println(" found E pos=" + i + "; try seek");
            }

            if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {

              scratchTerm.copy(seekTermEnum.term().bytes());
              getTermsDict().seekEnum(termEnum, seekTermEnum.term());

              newSuffixStart = 1+i;

              doPushes();

              // Found a match
              // TODO: faster seek?
              current = termEnum.term().bytes();
              return SeekStatus.NOT_FOUND;
            }
          }
        }

        if (DEBUG_SURROGATES) {
          System.out.println(" seek END");
        }

        current = null;
        return SeekStatus.END;
      } else {
        // We found a non-exact but non-null term; this one
        // is fun -- just treat it like next, by pretending
        // requested term was prev:
        prevTerm.copy(term);

        if (DEBUG_SURROGATES) {
          System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
        }

        final BytesRef br = t.bytes();
        assert br.offset == 0;

        setNewSuffixStart(term, br);

        surrogateDance();

        final Term t2 = termEnum.term();
        if (t2 == null || t2.field() != fieldInfo.name) {
          assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
          current = null;
          return SeekStatus.END;
        } else {
          current = t2.bytes();
          assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
          return SeekStatus.NOT_FOUND;
        }
      }
    }

    private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
      final int limit = Math.min(br1.length, br2.length);
      int lastStart = 0;
      for(int i=0;i<limit;i++) {
        if ((br1.bytes[br1.offset+i] & 0xc0) == 0xc0 || (br1.bytes[br1.offset+i] & 0x80) == 0) {
          lastStart = i;
        }
        if (br1.bytes[br1.offset+i] != br2.bytes[br2.offset+i]) {
          newSuffixStart = lastStart;
          if (DEBUG_SURROGATES) {
            System.out.println(" set newSuffixStart=" + newSuffixStart);
          }
          return;
        }
      }
      newSuffixStart = limit;
      if (DEBUG_SURROGATES) {
        System.out.println(" set newSuffixStart=" + newSuffixStart);
      }
    }

    @Override
    public BytesRef next() throws IOException {
      if (DEBUG_SURROGATES) {
        System.out.println("TE.next()");
      }
      if (skipNext) {
        if (DEBUG_SURROGATES) {
          System.out.println(" skipNext=true");
        }
        skipNext = false;
        if (termEnum.term() == null) {
          return null;
        } else if (termEnum.term().field() != fieldInfo.name) {
          return null;
        } else {
          return current = termEnum.term().bytes();
        }
      }

      // TODO: can we use STE's prevBuffer here?
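      // Save the term we are stepping off of; the surrogate dance below
      // compares this prior term against the new current term to decide
      // which seeks (push/continue/pop) are needed: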
      prevTerm.copy(termEnum.term().bytes());

      if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
        newSuffixStart = termEnum.newSuffixStart;
        if (DEBUG_SURROGATES) {
          System.out.println(" newSuffixStart=" + newSuffixStart);
        }
        surrogateDance();
        final Term t = termEnum.term();
        if (t == null || t.field() != fieldInfo.name) {
          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
          current = null;
        } else {
          current = t.bytes();
        }
        return current;
      } else {
        // This field is exhausted, but we have to give
        // surrogateDance a chance to seek back:
        if (DEBUG_SURROGATES) {
          System.out.println(" force cont");
        }
        //newSuffixStart = prevTerm.length;
        newSuffixStart = 0;
        surrogateDance();

        final Term t = termEnum.term();
        if (t == null || t.field() != fieldInfo.name) {
          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
          return null;
        } else {
          current = t.bytes();
          return current;
        }
      }
    }

    @Override
    public BytesRef term() {
      return current;
    }

    @Override
    public int docFreq() {
      return termEnum.docFreq();
    }

    @Override
    public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
      PreDocsEnum docsEnum;
      if (reuse == null || !(reuse instanceof PreDocsEnum)) {
        docsEnum = new PreDocsEnum();
      } else {
        docsEnum = (PreDocsEnum) reuse;
        if (docsEnum.getFreqStream() != freqStream) {
          docsEnum = new PreDocsEnum();
        }
      }
      return docsEnum.reset(termEnum, skipDocs);
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
      PreDocsAndPositionsEnum docsPosEnum;
      if (fieldInfo.omitTermFreqAndPositions) {
        return null;
      } else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
        docsPosEnum = new PreDocsAndPositionsEnum();
      } else {
        docsPosEnum = (PreDocsAndPositionsEnum) reuse;
        if (docsPosEnum.getFreqStream() != freqStream) {
          docsPosEnum = new PreDocsAndPositionsEnum();
        }
      }
      return docsPosEnum.reset(termEnum, skipDocs);
    }
  }

  private final class PreDocsEnum extends DocsEnum {
    final private SegmentTermDocs docs;

    PreDocsEnum() throws IOException {
      docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
    }

    IndexInput getFreqStream() {
      return freqStream;
    }

    public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
      docs.setSkipDocs(skipDocs);
      docs.seek(termEnum);
      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      if (docs.next()) {
        return docs.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }

    @Override
    public int advance(int target) throws IOException {
      if (docs.skipTo(target)) {
        return docs.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }

    @Override
    public int freq() {
      return docs.freq();
    }

    @Override
    public int docID() {
      return docs.doc();
    }

    @Override
    public int read() throws IOException {
      if (bulkResult == null) {
        initBulkResult();
        bulkResult.docs.ints = new int[32];
        bulkResult.freqs.ints = new int[32];
      }
      return this.docs.read(bulkResult.docs.ints, bulkResult.freqs.ints);
    }
  }

  private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
    final private SegmentTermPositions pos;

    PreDocsAndPositionsEnum() throws IOException {
      pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
    }

    IndexInput getFreqStream() {
      return freqStream;
    }

    public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
      pos.setSkipDocs(skipDocs);
      pos.seek(termEnum);
      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      if (pos.next()) {
        return pos.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }
    @Override
    public int advance(int target) throws IOException {
      if (pos.skipTo(target)) {
        return pos.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }

    @Override
    public int freq() {
      return pos.freq();
    }

    @Override
    public int docID() {
      return pos.doc();
    }

    @Override
    public int nextPosition() throws IOException {
      return pos.nextPosition();
    }

    @Override
    public boolean hasPayload() {
      return pos.isPayloadAvailable();
    }

    private BytesRef payload;

    @Override
    public BytesRef getPayload() throws IOException {
      final int len = pos.getPayloadLength();
      if (payload == null) {
        payload = new BytesRef();
        payload.bytes = new byte[len];
      } else {
        if (payload.bytes.length < len) {
          payload.grow(len);
        }
      }
      payload.bytes = pos.getPayload(payload.bytes, 0);
      payload.length = len;
      return payload;
    }
  }
}