package org.apache.lucene.index.codecs.preflex;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Comparator;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

/** Exposes flex API on a pre-flex index, as a codec.
 * @lucene.experimental */
public class PreFlexFields extends FieldsProducer {

  private static final boolean DEBUG_SURROGATES = false;

  public TermInfosReader tis;
  public final TermInfosReader tisNoIndex;

  public final IndexInput freqStream;
  public final IndexInput proxStream;
  final private FieldInfos fieldInfos;
  private final SegmentInfo si;
  final TreeMap<String,FieldInfo> fields = new TreeMap<String,FieldInfo>();
  private final Directory dir;
  private final int readBufferSize;
  private Directory cfsReader;

  public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
    throws IOException {

    si = info;

    // NOTE: we must always load terms index, even for
    // "sequential" scan during merging, because what is
    // sequential to merger may not be to TermInfosReader
    // since we do the surrogates dance:
    if (indexDivisor < 0) {
      indexDivisor = -indexDivisor;
    }

    TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
    if (indexDivisor == -1) {
      tisNoIndex = r;
    } else {
      tisNoIndex = null;
      tis = r;
    }
    this.readBufferSize = readBufferSize;
    this.fieldInfos = fieldInfos;

    // make sure that all index files have been read or are kept open
    // so that if an index update removes them we'll still have them
    freqStream = dir.openInput(info.name + ".frq", readBufferSize);
    boolean anyProx = false;
    final int numFields = fieldInfos.size();
    for(int i=0;i<numFields;i++) {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(i);
      if (fieldInfo.isIndexed) {
        fields.put(fieldInfo.name, fieldInfo);
        if (!fieldInfo.omitTermFreqAndPositions) {
          anyProx = true;
        }
      }
    }

    if (anyProx) {
      proxStream = dir.openInput(info.name + ".prx", readBufferSize);
    } else {
      proxStream = null;
    }

    this.dir = dir;
  }
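
  // Example of the two sort orders: consider the single-code-point terms
  // U+FFFD (UTF8 bytes EF BF BD) and U+10000 (UTF8 bytes F0 90 80 80).  In
  // UTF16 code unit order U+10000 is the surrogate pair D800 DC00, and
  // 0xD800 < 0xFFFD, so U+10000 sorts first; in unicode code point (= UTF8
  // byte) order U+FFFD sorts first.  The pre-flex terms dict stores the
  // former order, while the flex API must expose the latter -- hence the
  // "surrogates dance" below.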

  // If this returns true, we do the surrogates dance so that the
  // terms are sorted by unicode sort order.  This should be
  // true when segments are used for "normal" searching;
  // it's only false during testing, to create a pre-flex
  // index, using the test-only PreFlexRW.
  protected boolean sortTermsByUnicode() {
    return true;
  }

  static void files(Directory dir, SegmentInfo info, Collection<String> files) throws IOException {
    files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION));
    files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
    files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.FREQ_EXTENSION));
    if (info.getHasProx()) {
      // LUCENE-1739: for certain versions of 2.9-dev,
      // hasProx would be incorrectly computed during
      // indexing as true, and then stored into the segments
      // file, when it should have been false.  So we do the
      // extra check, here:
      final String prx = IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.PROX_EXTENSION);
      if (dir.fileExists(prx)) {
        files.add(prx);
      }
    }
  }

  @Override
  public FieldsEnum iterator() throws IOException {
    return new PreFlexFieldsEnum();
  }

  @Override
  public Terms terms(String field) {
    FieldInfo fi = fieldInfos.fieldInfo(field);
    if (fi != null) {
      return new PreTerms(fi);
    } else {
      return null;
    }
  }

  synchronized private TermInfosReader getTermsDict() {
    if (tis != null) {
      return tis;
    } else {
      return tisNoIndex;
    }
  }

  @Override
  synchronized public void loadTermsIndex(int indexDivisor) throws IOException {
    if (tis == null) {
      Directory dir0;
      if (si.getUseCompoundFile()) {
        // In some cases, we were originally opened when CFS
        // was not used, but then we are asked to open the
        // terms reader with index, the segment has switched
        // to CFS
        if (!(dir instanceof CompoundFileReader)) {
          dir0 = cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(si.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize);
        } else {
          dir0 = dir;
        }
      } else {
        dir0 = dir;
      }

      tis = new TermInfosReader(dir0, si.name, fieldInfos, readBufferSize, indexDivisor);
    }
  }

  @Override
  public void close() throws IOException {
    if (tis != null) {
      tis.close();
    }
    if (tisNoIndex != null) {
      tisNoIndex.close();
    }
    if (cfsReader != null) {
      cfsReader.close();
    }
    if (freqStream != null) {
      freqStream.close();
    }
    if (proxStream != null) {
      proxStream.close();
    }
  }

  private class PreFlexFieldsEnum extends FieldsEnum {
    final Iterator<FieldInfo> it;
    private final PreTermsEnum termsEnum;
    FieldInfo current;

    public PreFlexFieldsEnum() throws IOException {
      it = fields.values().iterator();
      termsEnum = new PreTermsEnum();
    }

    @Override
    public String next() {
      if (it.hasNext()) {
        current = it.next();
        return current.name;
      } else {
        return null;
      }
    }

    @Override
    public TermsEnum terms() throws IOException {
      termsEnum.reset(current);
      return termsEnum;
    }
  }

  private class PreTerms extends Terms {
    final FieldInfo fieldInfo;

    PreTerms(FieldInfo fieldInfo) {
      this.fieldInfo = fieldInfo;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      PreTermsEnum termsEnum = new PreTermsEnum();
      termsEnum.reset(fieldInfo);
      return termsEnum;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // Pre-flex indexes always sorted in UTF16 order, but
      // we remap on-the-fly to unicode order
      if (sortTermsByUnicode()) {
        return BytesRef.getUTF8SortedAsUnicodeComparator();
      } else {
        return BytesRef.getUTF8SortedAsUTF16Comparator();
      }
    }
  }
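
  // PreTermsEnum below classifies each UTF8 lead byte to decide which
  // "category" the corresponding UTF16 character falls in: a lead byte with
  // all bits of 0xf0 set starts a 4-byte sequence, ie a non-BMP code point
  // (a surrogate pair in UTF16), eg F0 in F0 90 80 80 (U+10000); a lead byte
  // with all bits of 0xee set (0xee or 0xef) starts a 3-byte sequence in
  // U+E000..U+FFFF, ie a BMP char that sorts after the surrogates, eg EF in
  // EF BF BD (U+FFFD).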

  private class PreTermsEnum extends TermsEnum {
    private SegmentTermEnum termEnum;
    private FieldInfo fieldInfo;
    private boolean skipNext;
    private BytesRef current;
    private SegmentTermEnum seekTermEnum;
    private Term protoTerm;

    private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
    private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;

    // Returns true if the unicode char is "after" the
    // surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
    private final boolean isHighBMPChar(byte[] b, int idx) {
      return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
    }

    // Returns true if the unicode char in the UTF8 byte
    // sequence starting at idx encodes a char outside of
    // BMP (ie what would be a surrogate pair in UTF16):
    private final boolean isNonBMPChar(byte[] b, int idx) {
      return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
    }

    private final byte[] scratch = new byte[4];
    private final BytesRef prevTerm = new BytesRef();
    private final BytesRef scratchTerm = new BytesRef();
    private int newSuffixStart;

    // Swap in S, in place of E:
    private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
      final int savLength = term.length;

      assert term.offset == 0;

      // The 3 bytes starting at downTo make up 1
      // unicode character:
      assert isHighBMPChar(term.bytes, pos);

      // NOTE: we cannot make this assert, because
      // AutomatonQuery legitimately sends us malformed UTF8
      // (eg the UTF8 bytes with just 0xee)
      // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

      // Save the bytes && length, since we need to
      // restore this if seek "back" finds no matching
      // terms
      if (term.bytes.length < 4+pos) {
        term.grow(4+pos);
      }

      scratch[0] = term.bytes[pos];
      scratch[1] = term.bytes[pos+1];
      scratch[2] = term.bytes[pos+2];

      term.bytes[pos] = (byte) 0xf0;
      term.bytes[pos+1] = (byte) 0x90;
      term.bytes[pos+2] = (byte) 0x80;
      term.bytes[pos+3] = (byte) 0x80;
      term.length = 4+pos;

      if (DEBUG_SURROGATES) {
        System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
      }

      // Seek "back":
      getTermsDict().seekEnum(te, protoTerm.createTerm(term));

      // Test if the term we seek'd to in fact found a
      // surrogate pair at the same position as the E:
      Term t2 = te.term();

      // Cannot be null (or move to next field) because at
      // "worst" it'd seek to the same term we are on now,
      // unless we are being called from seek
      if (t2 == null || t2.field() != fieldInfo.name) {
        return false;
      }

      if (DEBUG_SURROGATES) {
        System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()));
      }

      // Now test if prefix is identical and we found
      // a non-BMP char at the same position:
      BytesRef b2 = t2.bytes();
      assert b2.offset == 0;

      boolean matches;
      if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
        matches = true;
        for(int i=0;i<pos;i++) {
          if (term.bytes[i] != b2.bytes[i]) {
            matches = false;
            break;
          }
        }
      } else {
        matches = false;
      }

      // Restore term:
      term.length = savLength;
      term.bytes[pos] = scratch[0];
      term.bytes[pos+1] = scratch[1];
      term.bytes[pos+2] = scratch[2];

      return matches;
    }

    // Seek type 2 "continue" (back to the start of the
    // surrogates): scan the stripped suffix from the
    // prior term, backwards.  If there was an E in that
    // part, then we try to seek back to S.  If that
    // seek finds a matching term, we go there.
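    //
    // Example: a field with the one-char terms A (U+0041), U+E000 and
    // U+10000 is stored as A, U+10000, U+E000 (UTF16 order) but must be
    // returned as A, U+E000, U+10000 (unicode order).  Once U+E000 has been
    // returned, the suffix stripped from that prior term contains an E at
    // position 0, so we seek back with the smallest S (U+10000, ie F0 90 80
    // 80) swapped in at that position and, if the seek lands on a matching
    // term, enumerate the pending S terms from there.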
    private boolean doContinue() throws IOException {

      if (DEBUG_SURROGATES) {
        System.out.println(" try cont");
      }

      int downTo = prevTerm.length-1;

      boolean didSeek = false;

      final int limit = Math.min(newSuffixStart, scratchTerm.length-1);

      while(downTo > limit) {

        if (isHighBMPChar(prevTerm.bytes, downTo)) {

          if (DEBUG_SURROGATES) {
            System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length);
          }

          if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
            // TODO: more efficient seek?
            getTermsDict().seekEnum(termEnum, seekTermEnum.term());
            //newSuffixStart = downTo+4;
            newSuffixStart = downTo;
            scratchTerm.copy(termEnum.term().bytes());
            didSeek = true;
            if (DEBUG_SURROGATES) {
              System.out.println(" seek!");
            }
            break;
          } else {
            if (DEBUG_SURROGATES) {
              System.out.println(" no seek");
            }
          }
        }

        // Shorten prevTerm in place so that we don't redo
        // this loop if we come back here:
        if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
          prevTerm.length = downTo;
        }

        downTo--;
      }

      return didSeek;
    }

    // Look for seek type 3 ("pop"): if the delta from
    // prev -> current was replacing an S with an E,
    // we must now seek to beyond that E.  This seek
    // "finishes" the dance at this character
    // position.
    private boolean doPop() throws IOException {

      if (DEBUG_SURROGATES) {
        System.out.println(" try pop");
      }

      assert newSuffixStart <= prevTerm.length;
      assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;

      if (prevTerm.length > newSuffixStart &&
          isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
          isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {

        // Seek type 3 -- put 0xFF at this position:
        scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
        scratchTerm.length = newSuffixStart+1;

        if (DEBUG_SURROGATES) {
          System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
        }

        // TODO: more efficient seek?  can we simply swap
        // the enums?
        getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));

        final Term t2 = termEnum.term();

        // We could hit EOF or different field since this
        // was a seek "forward":
        if (t2 != null && t2.field() == fieldInfo.name) {

          if (DEBUG_SURROGATES) {
            System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
          }

          final BytesRef b2 = t2.bytes();
          assert b2.offset == 0;

          // Set newSuffixStart -- we can't use
          // termEnum's since the above seek may have
          // done no scanning (eg, term was precisely
          // an index term, or, was in the term seek
          // cache):
          scratchTerm.copy(b2);
          setNewSuffixStart(prevTerm, scratchTerm);

          return true;
        } else if (newSuffixStart != 0 || scratchTerm.length != 0) {
          if (DEBUG_SURROGATES) {
            System.out.println(" got term=null (or next field)");
          }
          newSuffixStart = 0;
          scratchTerm.length = 0;
          return true;
        }
      }

      return false;
    }

    // Pre-flex indices store terms in UTF16 sort order, but
    // certain queries require Unicode codepoint order; this
    // method carefully seeks around surrogates to handle
    // this impedance mismatch
    private void surrogateDance() throws IOException {

      if (!unicodeSortOrder) {
        return;
      }

      // We are invoked after TIS.next() (by UTF16 order) to
      // possibly seek to a different "next" (by unicode
      // order) term.

      // We scan only the "delta" from the last term to the
      // current term, in UTF8 bytes.  We look at 1) the bytes
      // stripped from the prior term, and then 2) the bytes
      // appended to that prior term's prefix.

      // We don't care about specific UTF8 sequences, just
      // the "category" of the UTF16 character.
      // Category S is a high/low surrogate pair (ie
      // non-BMP).  Category E is any BMP char > UNI_SUR_LOW_END
      // (and < U+FFFF).  Category A is the rest (any unicode
      // char <= UNI_SUR_HIGH_START).

      // The core issue is that pre-flex indices sort the
      // characters as ASE, while flex must sort as AES.  So
      // when scanning, when we hit S, we must 1) seek
      // forward to E and enum the terms there, then 2) seek
      // back to S and enum all terms there, then 3) seek to
      // after E.  Three different seek points (1, 2, 3).

      // We can easily detect S in UTF8: if a byte has
      // prefix 11110 (0xf0), then that byte and the
      // following 3 bytes encode a single unicode codepoint
      // in S.  Similarly, we can detect E: if a byte has
      // prefix 1110111 (0xee), then that byte and the
      // following 2 bytes encode a single unicode codepoint
      // in E.

      // Note that this is really a recursive process --
      // maybe the char at pos 2 needs to dance, but at any
      // point in its dance, suddenly pos 4 needs to dance
      // so you must finish pos 4 before returning to pos
      // 2.  But then during pos 4's dance maybe pos 7 needs
      // to dance, etc.  However, despite being recursive,
      // we don't need to hold any state because the state
      // can always be derived by looking at prior term &
      // current term.

      // TODO: can we avoid this copy?
      if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
        scratchTerm.length = 0;
      } else {
        scratchTerm.copy(termEnum.term().bytes());
      }

      if (DEBUG_SURROGATES) {
        System.out.println(" dance");
        System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
        System.out.println(" " + prevTerm.toString());
        System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
        System.out.println(" " + scratchTerm.toString());
      }

      // This code assumes TermInfosReader/SegmentTermEnum
      // always use BytesRef.offset == 0
      assert prevTerm.offset == 0;
      assert scratchTerm.offset == 0;

      // Need to loop here because we may need to do multiple
      // pops, and possibly a continue in the end, ie:
      //
      //  cont
      //  pop, cont
      //  pop, pop, cont
      //  <nothing>
      //
      while(true) {
        if (doContinue()) {
          break;
        } else {
          if (!doPop()) {
            break;
          }
        }
      }

      if (DEBUG_SURROGATES) {
        System.out.println(" finish bmp ends");
      }

      doPushes();
    }

    // Look for seek type 1 ("push"): if the newly added
    // suffix contains any S, we must try to seek to the
    // corresponding E.  If we find a match, we go there;
    // else we keep looking for additional S's in the new
    // suffix.  This "starts" the dance, at this character
    // position:
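    //
    // Example: stepping from "xA" to "x" + U+10000 (78 F0 90 80 80), the
    // newly appended suffix holds an S at position 1.  We temporarily
    // overwrite those bytes with EE 80 80 (U+E000, the smallest E), seek
    // forward, and if a term such as "x" + U+E123 exists with the same
    // prefix we jump there and enumerate the E terms first; the S terms are
    // picked up again later via doContinue().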
    private void doPushes() throws IOException {

      int upTo = newSuffixStart;

      if (DEBUG_SURROGATES) {
        System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
      }

      while(upTo < scratchTerm.length) {
        if (isNonBMPChar(scratchTerm.bytes, upTo) &&
            (upTo > newSuffixStart ||
             (upTo >= prevTerm.length ||
              (!isNonBMPChar(prevTerm.bytes, upTo) &&
               !isHighBMPChar(prevTerm.bytes, upTo))))) {

          // A non-BMP char (4 bytes UTF8) starts here:
          assert scratchTerm.length >= upTo + 4;

          final int savLength = scratchTerm.length;
          scratch[0] = scratchTerm.bytes[upTo];
          scratch[1] = scratchTerm.bytes[upTo+1];
          scratch[2] = scratchTerm.bytes[upTo+2];

          scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
          scratchTerm.bytes[upTo+1] = (byte) 0x80;
          scratchTerm.bytes[upTo+2] = (byte) 0x80;
          scratchTerm.length = upTo+3;

          if (DEBUG_SURROGATES) {
            System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
          }

          // Seek "forward":
          // TODO: more efficient seek?
          getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));

          scratchTerm.bytes[upTo] = scratch[0];
          scratchTerm.bytes[upTo+1] = scratch[1];
          scratchTerm.bytes[upTo+2] = scratch[2];
          scratchTerm.length = savLength;

          // Did we find a match?
          final Term t2 = seekTermEnum.term();

          if (DEBUG_SURROGATES) {
            if (t2 == null) {
              System.out.println(" hit term=null");
            } else {
              System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
            }
          }

          // Since this was a seek "forward", we could hit
          // EOF or a different field:
          boolean matches;
          if (t2 != null && t2.field() == fieldInfo.name) {
            final BytesRef b2 = t2.bytes();
            assert b2.offset == 0;
            if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
              matches = true;
              for(int i=0;i<upTo;i++) {
                if (scratchTerm.bytes[i] != b2.bytes[i]) {
                  matches = false;
                  break;
                }
              }
            } else {
              matches = false;
            }
          } else {
            matches = false;
          }

          if (matches) {

            if (DEBUG_SURROGATES) {
              System.out.println(" matches!");
            }

            // OK seek "back"
            // TODO: more efficient seek?
            getTermsDict().seekEnum(termEnum, seekTermEnum.term());

            scratchTerm.copy(seekTermEnum.term().bytes());

            // +3 because we don't need to check the char
            // at upTo: we know it's > BMP
            upTo += 3;

            // NOTE: we keep iterating, now, since this
            // can easily "recurse".
            // Ie, after seeking
            // forward at a certain char position, we may
            // find another surrogate in our [new] suffix
            // and must then do another seek (recurse)
          } else {
            upTo++;
          }
        } else {
          upTo++;
        }
      }
    }

    private boolean unicodeSortOrder;

    void reset(FieldInfo fieldInfo) throws IOException {
      //System.out.println("pff.reset te=" + termEnum);
      this.fieldInfo = fieldInfo;
      protoTerm = new Term(fieldInfo.name);
      if (termEnum == null) {
        termEnum = getTermsDict().terms(protoTerm);
        seekTermEnum = getTermsDict().terms(protoTerm);
        //System.out.println(" term=" + termEnum.term());
      } else {
        getTermsDict().seekEnum(termEnum, protoTerm);
      }
      skipNext = true;

      unicodeSortOrder = sortTermsByUnicode();

      final Term t = termEnum.term();
      if (t != null && t.field() == fieldInfo.name) {
        newSuffixStart = 0;
        prevTerm.length = 0;
        surrogateDance();
      }
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // Pre-flex indexes always sorted in UTF16 order, but
      // we remap on-the-fly to unicode order
      if (unicodeSortOrder) {
        return BytesRef.getUTF8SortedAsUnicodeComparator();
      } else {
        return BytesRef.getUTF8SortedAsUTF16Comparator();
      }
    }

    @Override
    public SeekStatus seek(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public long ord() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
      if (DEBUG_SURROGATES) {
        System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
      }
      skipNext = false;
      final TermInfosReader tis = getTermsDict();
      final Term t0 = protoTerm.createTerm(term);

      assert termEnum != null;

      tis.seekEnum(termEnum, t0);

      final Term t = termEnum.term();

      if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
        // If we found an exact match, no need to do the
        // surrogate dance
        if (DEBUG_SURROGATES) {
          System.out.println(" seek exact match");
        }
        current = t.bytes();
        return SeekStatus.FOUND;
      } else if (t == null || t.field() != fieldInfo.name) {

        // TODO: maybe we can handle this like the next()
        // into null?  set term as prevTerm then dance?

        if (DEBUG_SURROGATES) {
          System.out.println(" seek hit EOF");
        }

        // We hit EOF; try end-case surrogate dance: if we
        // find an E, try swapping in S, backwards:
        scratchTerm.copy(term);

        assert scratchTerm.offset == 0;

        for(int i=scratchTerm.length-1;i>=0;i--) {
          if (isHighBMPChar(scratchTerm.bytes, i)) {
            if (DEBUG_SURROGATES) {
              System.out.println(" found E pos=" + i + "; try seek");
            }

            if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {

              scratchTerm.copy(seekTermEnum.term().bytes());
              getTermsDict().seekEnum(termEnum, seekTermEnum.term());

              newSuffixStart = 1+i;

              doPushes();

              // Found a match
              // TODO: faster seek?
              current = termEnum.term().bytes();
              return SeekStatus.NOT_FOUND;
            }
          }
        }

        if (DEBUG_SURROGATES) {
          System.out.println(" seek END");
        }

        current = null;
        return SeekStatus.END;
      } else {
        // We found a non-exact but non-null term; this one
        // is fun -- just treat it like next, by pretending
        // requested term was prev:
        prevTerm.copy(term);

        if (DEBUG_SURROGATES) {
          System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
        }

        final BytesRef br = t.bytes();
        assert br.offset == 0;

        setNewSuffixStart(term, br);

        surrogateDance();

        final Term t2 = termEnum.term();
        if (t2 == null || t2.field() != fieldInfo.name) {
          assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
          current = null;
          return SeekStatus.END;
        } else {
          current = t2.bytes();
          assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
          return SeekStatus.NOT_FOUND;
        }
      }
    }

    private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
      final int limit = Math.min(br1.length, br2.length);
      int lastStart = 0;
      for(int i=0;i<limit;i++) {
        if ((br1.bytes[br1.offset+i] & 0xc0) == 0xc0 || (br1.bytes[br1.offset+i] & 0x80) == 0) {
          lastStart = i;
        }
        if (br1.bytes[br1.offset+i] != br2.bytes[br2.offset+i]) {
          newSuffixStart = lastStart;
          if (DEBUG_SURROGATES) {
            System.out.println(" set newSuffixStart=" + newSuffixStart);
          }
          return;
        }
      }
      newSuffixStart = limit;
      if (DEBUG_SURROGATES) {
        System.out.println(" set newSuffixStart=" + newSuffixStart);
      }
    }

    @Override
    public BytesRef next() throws IOException {
      if (DEBUG_SURROGATES) {
        System.out.println("TE.next()");
      }
      if (skipNext) {
        if (DEBUG_SURROGATES) {
          System.out.println(" skipNext=true");
        }
        skipNext = false;
        if (termEnum.term() == null) {
          return null;
        } else if (termEnum.term().field() != fieldInfo.name) {
          return null;
        } else {
          return current = termEnum.term().bytes();
        }
      }

      // TODO: can we use STE's prevBuffer here?
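      // Save the term we are stepping off of; the surrogate dance below
      // compares this prior term against the new current term to decide
      // which seeks (push/continue/pop) are needed: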
      prevTerm.copy(termEnum.term().bytes());

      if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
        newSuffixStart = termEnum.newSuffixStart;
        if (DEBUG_SURROGATES) {
          System.out.println(" newSuffixStart=" + newSuffixStart);
        }
        surrogateDance();
        final Term t = termEnum.term();
        if (t == null || t.field() != fieldInfo.name) {
          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
          current = null;
        } else {
          current = t.bytes();
        }
        return current;
      } else {
        // This field is exhausted, but we have to give
        // surrogateDance a chance to seek back:
        if (DEBUG_SURROGATES) {
          System.out.println(" force cont");
        }
        //newSuffixStart = prevTerm.length;
        newSuffixStart = 0;
        surrogateDance();

        final Term t = termEnum.term();
        if (t == null || t.field() != fieldInfo.name) {
          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
          return null;
        } else {
          current = t.bytes();
          return current;
        }
      }
    }

    @Override
    public BytesRef term() {
      return current;
    }

    @Override
    public int docFreq() {
      return termEnum.docFreq();
    }

    @Override
    public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
      PreDocsEnum docsEnum;
      if (reuse == null || !(reuse instanceof PreDocsEnum)) {
        docsEnum = new PreDocsEnum();
      } else {
        docsEnum = (PreDocsEnum) reuse;
        if (docsEnum.getFreqStream() != freqStream) {
          docsEnum = new PreDocsEnum();
        }
      }
      return docsEnum.reset(termEnum, skipDocs);
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
      PreDocsAndPositionsEnum docsPosEnum;
      if (fieldInfo.omitTermFreqAndPositions) {
        return null;
      } else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
        docsPosEnum = new PreDocsAndPositionsEnum();
      } else {
        docsPosEnum = (PreDocsAndPositionsEnum) reuse;
        if (docsPosEnum.getFreqStream() != freqStream) {
          docsPosEnum = new PreDocsAndPositionsEnum();
        }
      }
      return docsPosEnum.reset(termEnum, skipDocs);
    }
  }

  private final class PreDocsEnum extends DocsEnum {
    final private SegmentTermDocs docs;

    PreDocsEnum() throws IOException {
      docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
    }

    IndexInput getFreqStream() {
      return freqStream;
    }

    public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
      docs.setSkipDocs(skipDocs);
      docs.seek(termEnum);
      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      if (docs.next()) {
        return docs.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }

    @Override
    public int advance(int target) throws IOException {
      if (docs.skipTo(target)) {
        return docs.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }

    @Override
    public int freq() {
      return docs.freq();
    }

    @Override
    public int docID() {
      return docs.doc();
    }

    @Override
    public int read() throws IOException {
      if (bulkResult == null) {
        initBulkResult();
        bulkResult.docs.ints = new int[32];
        bulkResult.freqs.ints = new int[32];
      }
      return this.docs.read(bulkResult.docs.ints, bulkResult.freqs.ints);
    }
  }

  private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
    final private SegmentTermPositions pos;

    PreDocsAndPositionsEnum() throws IOException {
      pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
    }

    IndexInput getFreqStream() {
      return freqStream;
    }

    public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
      pos.setSkipDocs(skipDocs);
      pos.seek(termEnum);
      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      if (pos.next()) {
        return pos.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }
    @Override
    public int advance(int target) throws IOException {
      if (pos.skipTo(target)) {
        return pos.doc();
      } else {
        return NO_MORE_DOCS;
      }
    }

    @Override
    public int freq() {
      return pos.freq();
    }

    @Override
    public int docID() {
      return pos.doc();
    }

    @Override
    public int nextPosition() throws IOException {
      return pos.nextPosition();
    }

    @Override
    public boolean hasPayload() {
      return pos.isPayloadAvailable();
    }

    private BytesRef payload;

    @Override
    public BytesRef getPayload() throws IOException {
      final int len = pos.getPayloadLength();
      if (payload == null) {
        payload = new BytesRef();
        payload.bytes = new byte[len];
      } else {
        if (payload.bytes.length < len) {
          payload.grow(len);
        }
      }
      payload.bytes = pos.getPayload(payload.bytes, 0);
      payload.length = len;
      return payload;
    }
  }
}