package org.apache.lucene.codecs.lucene40.values;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Base class for specific Bytes Reader/Writer implementations */
import java.io.IOException;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.DocValues.SortedSource;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Provides concrete Writer/Reader implementations for <tt>byte[]</tt> value per
 * document. There are 6 package-private default implementations of this, for
 * all combinations of {@link Mode#DEREF}/{@link Mode#STRAIGHT} x fixed-length/variable-length.
 *
 * <p>
 * NOTE: Currently the total amount of byte[] data stored (across a single
 * segment) cannot exceed 2GB.
 * </p>
 * <p>
 * NOTE: Each byte[] must be <= 32768 bytes in length
 * </p>
 *
 * @lucene.experimental
 */
public final class Bytes {

  // file-name suffix shared by all doc-values data/index files written here
  static final String DV_SEGMENT_SUFFIX = "dv";

  // TODO - add bulk copy where possible

  // static factory holder only — never instantiated
  private Bytes() { /* don't instantiate! */ }

  /**
   * Defines the {@link Writer}s store mode. The writer will either store the
   * bytes sequentially ({@link #STRAIGHT}, dereferenced ({@link #DEREF}) or
   * sorted ({@link #SORTED})
   *
   * @lucene.experimental
   */
  public static enum Mode {
    /**
     * Mode for sequentially stored bytes
     */
    STRAIGHT,
    /**
     * Mode for dereferenced stored bytes
     */
    DEREF,
    /**
     * Mode for sorted stored bytes
     */
    SORTED
  };

  /**
   * Creates a new <tt>byte[]</tt> {@link Writer} instances for the given
   * directory.
   *
   * @param dir
   *          the directory to write the values to
   * @param id
   *          the id used to create a unique file name. Usually composed out of
   *          the segment name and a unique id per segment.
   * @param mode
   *          the writers store mode
   * @param fixedSize
   *          <code>true</code> if all bytes subsequently passed to the
   *          {@link Writer} will have the same length
   * @param sortComparator {@link BytesRef} comparator used by sorted variants.
   *          If <code>null</code> {@link BytesRef#getUTF8SortedAsUnicodeComparator()}
   *          is used instead
   * @param bytesUsed
   *          an {@link AtomicLong} instance to track the used bytes within the
   *          {@link Writer}.
A call to {@link Writer#finish(int)} will release * all internally used resources and frees the memory tracking * reference. * @param acceptableOverheadRatio * how to trade space for speed. This option is only applicable for * docvalues of type {@link Type#BYTES_FIXED_SORTED} and * {@link Type#BYTES_VAR_SORTED}. * @param context I/O Context * @return a new {@link Writer} instance * @see PackedInts#getReader(org.apache.lucene.store.DataInput) */ public static DocValuesConsumer getWriter(Directory dir, String id, Mode mode, boolean fixedSize, Comparator<BytesRef> sortComparator, Counter bytesUsed, IOContext context, float acceptableOverheadRatio) { // TODO -- i shouldn't have to specify fixed? can // track itself & do the write thing at write time? if (sortComparator == null) { sortComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); } if (fixedSize) { if (mode == Mode.STRAIGHT) { return new FixedStraightBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.DEREF) { return new FixedDerefBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.SORTED) { return new FixedSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); } } else { if (mode == Mode.STRAIGHT) { return new VarStraightBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.DEREF) { return new VarDerefBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.SORTED) { return new VarSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); } } throw new IllegalArgumentException(""); } /** * Creates a new {@link DocValues} instance that provides either memory * resident or iterative access to a per-document stored <tt>byte[]</tt> * value. The returned {@link DocValues} instance will be initialized without * consuming a significant amount of memory. * * @param dir * the directory to load the {@link DocValues} from. 
* @param id * the file ID in the {@link Directory} to load the values from. * @param mode * the mode used to store the values * @param fixedSize * <code>true</code> iff the values are stored with fixed-size, * otherwise <code>false</code> * @param maxDoc * the number of document values stored for the given ID * @param sortComparator {@link BytesRef} comparator used by sorted variants. * If <code>null</code> {@link BytesRef#getUTF8SortedAsUnicodeComparator()} * is used instead * @return an initialized {@link DocValues} instance. * @throws IOException * if an {@link IOException} occurs */ public static DocValues getValues(Directory dir, String id, Mode mode, boolean fixedSize, int maxDoc, Comparator<BytesRef> sortComparator, IOContext context) throws IOException { if (sortComparator == null) { sortComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); } // TODO -- I can peek @ header to determing fixed/mode? if (fixedSize) { if (mode == Mode.STRAIGHT) { return new FixedStraightBytesImpl.FixedStraightReader(dir, id, maxDoc, context); } else if (mode == Mode.DEREF) { return new FixedDerefBytesImpl.FixedDerefReader(dir, id, maxDoc, context); } else if (mode == Mode.SORTED) { return new FixedSortedBytesImpl.Reader(dir, id, maxDoc, context, Type.BYTES_FIXED_SORTED, sortComparator); } } else { if (mode == Mode.STRAIGHT) { return new VarStraightBytesImpl.VarStraightReader(dir, id, maxDoc, context); } else if (mode == Mode.DEREF) { return new VarDerefBytesImpl.VarDerefReader(dir, id, maxDoc, context); } else if (mode == Mode.SORTED) { return new VarSortedBytesImpl.Reader(dir, id, maxDoc,context, Type.BYTES_VAR_SORTED, sortComparator); } } throw new IllegalArgumentException("Illegal Mode: " + mode); } // TODO open up this API? 
  /**
   * Base class for memory-resident {@link Source} implementations: the
   * constructor copies {@code bytesToRead} bytes from the data input into a
   * {@link PagedBytes} instance and freezes it for random access.
   */
  static abstract class BytesSourceBase extends Source {
    private final PagedBytes pagedBytes;
    protected final IndexInput datIn;
    // may be null — NOTE(review): only some variants appear to use an index file
    protected final IndexInput idxIn;
    protected final static int PAGED_BYTES_BITS = 15;
    // frozen, random-access view over the copied bytes
    protected final PagedBytes.Reader data;
    protected final long totalLengthInBytes;

    protected BytesSourceBase(IndexInput datIn, IndexInput idxIn,
        PagedBytes pagedBytes, long bytesToRead, Type type) throws IOException {
      super(type);
      assert bytesToRead <= datIn.length() : " file size is less than the expected size diff: "
          + (bytesToRead - datIn.length()) + " pos: " + datIn.getFilePointer();
      this.datIn = datIn;
      this.totalLengthInBytes = bytesToRead;
      this.pagedBytes = pagedBytes;
      // read everything up-front; freeze(true) makes the data immutable/readable
      this.pagedBytes.copy(datIn, bytesToRead);
      data = pagedBytes.freeze(true);
      this.idxIn = idxIn;
    }
  }

  // TODO: open up this API?!
  /**
   * Base class for the Writer side: lazily creates a data file and (optionally)
   * an index file, writing a codec header into each on creation.
   */
  static abstract class BytesWriterBase extends Writer {
    private final String id;
    private IndexOutput idxOut;   // created lazily by getOrCreateIndexOut()
    private IndexOutput datOut;   // created lazily by getOrCreateDataOut()
    protected BytesRef bytesRef = new BytesRef();
    private final Directory dir;
    private final String codecNameIdx;
    private final String codecNameDat;
    private final int version;
    private final IOContext context;

    protected BytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int version, Counter bytesUsed, IOContext context,
        Type type) {
      super(bytesUsed, type);
      this.id = id;
      this.dir = dir;
      this.codecNameIdx = codecNameIdx;
      this.codecNameDat = codecNameDat;
      this.version = version;
      this.context = context;
      // at least one of the two codec names must be given, and when both are
      // given they must differ (they name distinct files)
      assert codecNameDat != null || codecNameIdx != null : "both codec names are null";
      assert (codecNameDat != null && !codecNameDat.equals(codecNameIdx))
          || (codecNameIdx != null && !codecNameIdx.equals(codecNameDat)) :
          "index and data codec names must not be equal";
    }

    /**
     * Returns the data output, creating the ".dat"-side file and writing its
     * codec header on first call. On failure the partially-created output is
     * closed while suppressing secondary exceptions.
     */
    protected IndexOutput getOrCreateDataOut() throws IOException {
      if (datOut == null) {
        boolean success = false;
        assert codecNameDat != null;
        try {
          datOut = dir.createOutput(IndexFileNames.segmentFileName(id,
              DV_SEGMENT_SUFFIX, DocValuesWriterBase.DATA_EXTENSION), context);
          CodecUtil.writeHeader(datOut, codecNameDat, version);
          success = true;
        } finally {
          if (!success) {
            IOUtils.closeWhileHandlingException(datOut);
          }
        }
      }
      return datOut;
    }

    /** Returns the index output, or null if not yet created. */
    protected IndexOutput getIndexOut() {
      return idxOut;
    }

    /** Returns the data output, or null if not yet created. */
    protected IndexOutput getDataOut() {
      return datOut;
    }

    /**
     * Returns the index output, creating the index-side file and writing its
     * codec header on first call (lazy counterpart of {@link #getOrCreateDataOut}).
     */
    protected IndexOutput getOrCreateIndexOut() throws IOException {
      boolean success = false;
      try {
        if (idxOut == null) {
          assert codecNameIdx != null;
          idxOut = dir.createOutput(IndexFileNames.segmentFileName(id,
              DV_SEGMENT_SUFFIX, DocValuesWriterBase.INDEX_EXTENSION), context);
          CodecUtil.writeHeader(idxOut, codecNameIdx, version);
        }
        success = true;
      } finally {
        if (!success) {
          IOUtils.closeWhileHandlingException(idxOut);
        }
      }
      return idxOut;
    }

    @Override
    public abstract void finish(int docCount) throws IOException;
  }

  /**
   * Opens all necessary files, but does not read any data in until you call
   * {@link #load}.
   */
  static abstract class BytesReaderBase extends DocValues {
    protected final IndexInput idxIn;  // null unless doIndex was true
    protected final IndexInput datIn;
    protected final int version;       // header version read from the data file
    protected final String id;
    protected final Type type;

    protected BytesReaderBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int maxVersion, boolean doIndex, IOContext context,
        Type type) throws IOException {
      // open into locals first so the finally block can clean up on failure
      IndexInput dataIn = null;
      IndexInput indexIn = null;
      boolean success = false;
      try {
        dataIn = dir.openInput(IndexFileNames.segmentFileName(id,
            DV_SEGMENT_SUFFIX, DocValuesWriterBase.DATA_EXTENSION), context);
        version = CodecUtil.checkHeader(dataIn, codecNameDat, maxVersion, maxVersion);
        if (doIndex) {
          indexIn = dir.openInput(IndexFileNames.segmentFileName(id,
              DV_SEGMENT_SUFFIX, DocValuesWriterBase.INDEX_EXTENSION), context);
          final int version2 = CodecUtil.checkHeader(indexIn, codecNameIdx,
              maxVersion, maxVersion);
          // data and index files must have been written with the same version
          assert version == version2;
        }
        success = true;
      } finally {
        if (!success) {
          IOUtils.closeWhileHandlingException(dataIn, indexIn);
        }
      }
      datIn = dataIn;
      idxIn = indexIn;
      this.type = type;
      this.id = id;
    }

    /**
     * clones and returns the data {@link IndexInput}
     */
    protected final IndexInput cloneData() {
      assert datIn != null;
      return datIn.clone();
    }

    /**
     * clones and returns the indexing {@link IndexInput}
     */
    protected final IndexInput cloneIndex() {
      assert idxIn != null;
      return idxIn.clone();
    }

    @Override
    public void close() throws IOException {
      try {
        super.close();
      } finally {
        // close both inputs even if super.close() throws
        IOUtils.close(datIn, idxIn);
      }
    }

    @Override
    public Type getType() {
      return type;
    }
  }

  /**
   * Writer base for the dereferenced variants: each distinct byte[] value is
   * stored once in a {@link BytesRefHash} and documents map to its ord via
   * {@code docToEntry}.
   */
  static abstract class DerefBytesWriterBase extends BytesWriterBase {
    protected int size = -1;        // fixed value length; -1 until first value seen
    protected int lastDocId = -1;   // highest docID passed to add() so far
    protected int[] docToEntry;     // docID -> ord in hash
    protected final BytesRefHash hash;
    protected final float acceptableOverheadRatio;
    protected long maxBytes = 0;    // total bytes of distinct values added

    protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int codecVersion, Counter bytesUsed,
        IOContext context, Type type) {
      this(dir, id, codecNameIdx, codecNameDat, codecVersion,
          new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed),
          bytesUsed, context, PackedInts.DEFAULT, type);
    }

    protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int codecVersion, Counter bytesUsed,
        IOContext context, float acceptableOverheadRatio, Type type) {
      this(dir, id, codecNameIdx, codecNameDat, codecVersion,
          new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed),
          bytesUsed, context, acceptableOverheadRatio, type);
    }

    protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int codecVersion, Allocator allocator,
        Counter bytesUsed, IOContext context, float acceptableOverheadRatio,
        Type type) {
      super(dir, id, codecNameIdx, codecNameDat, codecVersion, bytesUsed, context, type);
      // hash pool and start array both report memory into bytesUsed
      hash = new BytesRefHash(new ByteBlockPool(allocator),
          BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray(
              BytesRefHash.DEFAULT_CAPACITY, bytesUsed));
      docToEntry = new int[1];
      // account for the initial 1-slot docToEntry array
      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT);
      this.acceptableOverheadRatio = acceptableOverheadRatio;
    }

    /**
     * Writes the length prefix for {@code bytes}: one byte when length < 128,
     * otherwise two bytes with the high bit of the first byte set. Returns the
     * number of header bytes written (1 or 2).
     */
    protected static int writePrefixLength(DataOutput datOut, BytesRef bytes)
        throws IOException {
      if (bytes.length < 128) {
        datOut.writeByte((byte) bytes.length);
        return 1;
      } else {
        datOut.writeByte((byte) (0x80 | (bytes.length >> 8)));
        datOut.writeByte((byte) (bytes.length & 0xff));
        return 2;
      }
    }

    @Override
    public void add(int docID, IndexableField value) throws IOException {
      BytesRef bytes = value.binaryValue();
      assert bytes != null;
      if (bytes.length == 0) { // default value - skip it
        return;
      }
      checkSize(bytes);
      // assign default ords to any docs skipped between lastDocId and docID
      fillDefault(docID);
      int ord = hash.add(bytes);
      if (ord < 0) {
        // BytesRefHash returns -(ord+1) for values already present
        ord = (-ord) - 1;
      } else {
        // first time we see this value — count its bytes
        maxBytes += bytes.length;
      }
      docToEntry[docID] = ord;
      lastDocId = docID;
    }

    /**
     * Grows {@code docToEntry} to hold {@code docID} and fills the gap
     * (lastDocId+1 .. docID-1) with the ord of the default (all-zero,
     * fixed-size) value.
     */
    protected void fillDefault(int docID) {
      if (docID >= docToEntry.length) {
        // note: this local shadows the field 'size' below intentionally? —
        // NOTE(review): the shadowing looks accidental but is harmless here
        final int size = docToEntry.length;
        docToEntry = ArrayUtil.grow(docToEntry, 1 + docID);
        bytesUsed.addAndGet((docToEntry.length - size)
            * RamUsageEstimator.NUM_BYTES_INT);
      }
      assert size >= 0;
      // default value: 'size' zero bytes (checkSize ran before we get here)
      BytesRef ref = new BytesRef(size);
      ref.length = size;
      int ord = hash.add(ref);
      if (ord < 0) {
        ord = (-ord) - 1;
      }
      for (int i = lastDocId+1; i < docID; i++) {
        docToEntry[i] = ord;
      }
    }

    /**
     * Records the value length on first call and rejects any later value whose
     * length differs (fixed-size enforcement).
     */
    protected void checkSize(BytesRef bytes) {
      if (size == -1) {
        size = bytes.length;
      } else if (bytes.length != size) {
        throw new IllegalArgumentException("expected bytes size=" + size
            + " but got " + bytes.length);
      }
    }

    /** Returns the fixed value length, or -1 if no value has been added yet. */
    public int getValueSize() {
      return size;
    }

    // Important that we get docCount, in case there were
    // some last docs that we didn't see
    @Override
    public void finish(int docCount) throws IOException {
      boolean success = false;
      try {
        finishInternal(docCount);
        success = true;
      } finally {
        // always release in-memory state; close outputs normally on success,
        // suppressing secondary exceptions on failure
        releaseResources();
        if (success) {
          IOUtils.close(getIndexOut(), getDataOut());
        } else {
          IOUtils.closeWhileHandlingException(getIndexOut(), getDataOut());
        }
      }
    }

    // subclass hook: write remaining data/index content for docCount docs
    protected abstract void finishInternal(int docCount) throws IOException;

    /** Frees the hash and docToEntry array, returning their memory to bytesUsed. */
    protected void releaseResources() {
      hash.close();
      bytesUsed.addAndGet((-docToEntry.length)
          * RamUsageEstimator.NUM_BYTES_INT);
      docToEntry = null;
    }

    protected void writeIndex(IndexOutput idxOut, int docCount, long maxValue,
        int[] toEntry) throws IOException {
      writeIndex(idxOut, docCount, maxValue, (int[])null, toEntry);
    }

    /**
     * Writes a packed-ints doc->value index. When {@code addresses} is non-null
     * each doc's entry is mapped through it; docs beyond the last added doc
     * get value 0.
     */
    protected void writeIndex(IndexOutput idxOut, int docCount, long maxValue,
        int[] addresses, int[] toEntry) throws IOException {
      final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount,
          PackedInts.bitsRequired(maxValue), acceptableOverheadRatio);
      final int limit = docCount > docToEntry.length ? docToEntry.length
          : docCount;
      // NOTE(review): 'limit - 1' (not 'limit') — presumably tolerates a
      // trailing unset slot; confirm against callers
      assert toEntry.length >= limit -1;
      if (addresses != null) {
        for (int i = 0; i < limit; i++) {
          assert addresses[toEntry[i]] >= 0;
          w.add(addresses[toEntry[i]]);
        }
      } else {
        for (int i = 0; i < limit; i++) {
          assert toEntry[i] >= 0;
          w.add(toEntry[i]);
        }
      }
      // pad docs we never saw with 0
      for (int i = limit; i < docCount; i++) {
        w.add(0);
      }
      w.finish();
    }

    /**
     * Same as {@link #writeIndex(IndexOutput, int, long, int[], int[])} but
     * with long-valued addresses.
     */
    protected void writeIndex(IndexOutput idxOut, int docCount, long maxValue,
        long[] addresses, int[] toEntry) throws IOException {
      final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount,
          PackedInts.bitsRequired(maxValue), acceptableOverheadRatio);
      final int limit = docCount > docToEntry.length ? docToEntry.length
          : docCount;
      assert toEntry.length >= limit -1;
      if (addresses != null) {
        for (int i = 0; i < limit; i++) {
          assert addresses[toEntry[i]] >= 0;
          w.add(addresses[toEntry[i]]);
        }
      } else {
        for (int i = 0; i < limit; i++) {
          assert toEntry[i] >= 0;
          w.add(toEntry[i]);
        }
      }
      for (int i = limit; i < docCount; i++) {
        w.add(0);
      }
      w.finish();
    }
  }

  /**
   * Base class for memory-resident {@link SortedSource} implementations: loads
   * the data bytes into a frozen {@link PagedBytes} and reads the packed
   * doc->ord (and optional ord->offset) tables from the index input.
   */
  static abstract class BytesSortedSourceBase extends SortedSource {
    private final PagedBytes pagedBytes;
    protected final PackedInts.Reader docToOrdIndex;
    // null when the variant has no per-ord offsets
    protected final PackedInts.Reader ordToOffsetIndex;
    protected final IndexInput datIn;
    protected final IndexInput idxIn;
    protected final BytesRef defaultValue = new BytesRef();
    protected final static int PAGED_BYTES_BITS = 15;
    protected final PagedBytes.Reader data;

    protected BytesSortedSourceBase(IndexInput datIn, IndexInput idxIn,
        Comparator<BytesRef> comp, long bytesToRead, Type type, boolean hasOffsets)
        throws IOException {
      this(datIn, idxIn, comp, new PagedBytes(PAGED_BYTES_BITS), bytesToRead,
          type, hasOffsets);
    }

    protected BytesSortedSourceBase(IndexInput datIn, IndexInput idxIn,
        Comparator<BytesRef> comp, PagedBytes pagedBytes, long bytesToRead,
        Type type, boolean hasOffsets) throws IOException {
      super(type, comp);
      assert bytesToRead <= datIn.length() : " file size is less than the expected size diff: "
          + (bytesToRead - datIn.length()) + " pos: " + datIn.getFilePointer();
      this.datIn = datIn;
      this.pagedBytes = pagedBytes;
      this.pagedBytes.copy(datIn, bytesToRead);
      data = pagedBytes.freeze(true);
      this.idxIn = idxIn;
      // on-disk order: offsets table (if present) precedes doc->ord table
      ordToOffsetIndex = hasOffsets ? PackedInts.getReader(idxIn) : null;
      docToOrdIndex = PackedInts.getReader(idxIn);
    }

    @Override
    public boolean hasPackedDocToOrd() {
      return true;
    }

    @Override
    public PackedInts.Reader getDocToOrd() {
      return docToOrdIndex;
    }

    @Override
    public int ord(int docID) {
      assert docToOrdIndex.get(docID) < getValueCount();
      return (int) docToOrdIndex.get(docID);
    }

    protected void closeIndexInput() throws IOException {
      IOUtils.close(datIn, idxIn);
    }
  }
}