/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.memory;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.LegacyDocValuesIterables;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;

import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_COMPRESSED;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_SIZE;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BYTES;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.SORTED_NUMERIC;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.SORTED_NUMERIC_SINGLETON;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.SORTED_SET;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.SORTED_SET_SINGLETON;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.VERSION_CURRENT;

/**
 * Writer for {@link MemoryDocValuesFormat}
 */
class MemoryDocValuesConsumer extends DocValuesConsumer {
  IndexOutput data, meta;
  final int maxDoc;
  final float acceptableOverheadRatio;

  MemoryDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension,
                          String metaCodec, String metaExtension, float acceptableOverheadRatio) throws IOException {
    this.acceptableOverheadRatio = acceptableOverheadRatio;
    maxDoc = state.segmentInfo.maxDoc();
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    addNumericField(field, LegacyDocValuesIterables.numericIterable(field, valuesProducer, maxDoc), true);
  }

  void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(NUMBER);
    meta.writeLong(data.getFilePointer());
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    long blockSum = 0;
    long gcd = 0;
    boolean missing = false;
    // TODO: more efficient?
    HashSet<Long> uniqueValues = null;
    long count = 0;
    if (optimizeStorage) {
      uniqueValues = new HashSet<>();

      long currentBlockMin = Long.MAX_VALUE;
      long currentBlockMax = Long.MIN_VALUE;
      for (Number nv : values) {
        final long v;
        if (nv == null) {
          v = 0;
          missing = true;
        } else {
          v = nv.longValue();
        }

        if (gcd != 1) {
          if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
            // in that case v - minValue might overflow and make the GCD computation return
            // wrong results. Since these extreme values are unlikely, we just discard
            // GCD computation for them
            gcd = 1;
          } else if (count != 0) { // minValue needs to be set first
            gcd = MathUtil.gcd(gcd, v - minValue);
          }
        }

        // update per-block and global min/max for this value
        currentBlockMin = Math.min(currentBlockMin, v);
        currentBlockMax = Math.max(currentBlockMax, v);

        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);

        if (uniqueValues != null) {
          if (uniqueValues.add(v)) {
            if (uniqueValues.size() > 256) {
              uniqueValues = null;
            }
          }
        }

        ++count;
        if (count % BLOCK_SIZE == 0) {
          final long blockDelta = currentBlockMax - currentBlockMin;
          final int blockDeltaRequired = PackedInts.unsignedBitsRequired(blockDelta);
          final int blockBPV = PackedInts.fastestFormatAndBits(BLOCK_SIZE, blockDeltaRequired, acceptableOverheadRatio).bitsPerValue;
          blockSum += blockBPV;
          currentBlockMax = Long.MIN_VALUE;
          currentBlockMin = Long.MAX_VALUE;
        }
      }
    } else {
      for (Number nv : values) {
        long v = nv.longValue();
        maxValue = Math.max(v, maxValue);
        minValue = Math.min(v, minValue);
        count++;
      }
    }

    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }

    final long delta = maxValue - minValue;
    final int deltaRequired = delta < 0 ? 64 : PackedInts.bitsRequired(delta);
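    // At this point the single pass above has gathered everything needed to pick an
    // encoding: global min/max (delta compression), the GCD of differences from the
    // minimum (GCD compression), up to 256 unique values (table compression), and an
    // estimated per-block cost (block compression). Illustrative example, not from the
    // original source: for the values {1000, 2000, 3000}, plain delta compression needs
    // bitsRequired(2000) = 11 bits per value, while GCD compression stores gcd = 1000
    // plus the quotients {0, 1, 2}, and a 3-entry table stores indices {0, 1, 2} --
    // both of the latter need only bitsRequired(2) = 2 bits per value.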
    final FormatAndBits deltaBPV = PackedInts.fastestFormatAndBits(maxDoc, deltaRequired, acceptableOverheadRatio);

    final FormatAndBits tableBPV;
    if (count < Integer.MAX_VALUE && uniqueValues != null) {
      tableBPV = PackedInts.fastestFormatAndBits(maxDoc, PackedInts.bitsRequired(uniqueValues.size()-1), acceptableOverheadRatio);
    } else {
      tableBPV = null;
    }

    final FormatAndBits gcdBPV;
    if (count < Integer.MAX_VALUE && gcd != 0 && gcd != 1) {
      final long gcdDelta = (maxValue - minValue) / gcd;
      final int gcdRequired = gcdDelta < 0 ? 64 : PackedInts.bitsRequired(gcdDelta);
      gcdBPV = PackedInts.fastestFormatAndBits(maxDoc, gcdRequired, acceptableOverheadRatio);
    } else {
      gcdBPV = null;
    }

    boolean doBlock = false;
    if (blockSum != 0) {
      int numBlocks = maxDoc / BLOCK_SIZE;
      float avgBPV = blockSum / (float)numBlocks;
      // just a heuristic: with a tiny number of blocks our estimate is skewed, as we ignore
      // the final "incomplete" block. With at least 4 blocks it's pretty accurate.
      // The difference must also be significant (according to acceptable overhead).
      if (numBlocks >= 4 && (avgBPV + avgBPV*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
        doBlock = true;
      }
    }
    // blockpackedreader allows us to read in huge streams of ints
    if (count >= Integer.MAX_VALUE) {
      doBlock = true;
    }

    if (tableBPV != null && (tableBPV.bitsPerValue + tableBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
      // small number of unique values
      meta.writeByte(TABLE_COMPRESSED); // table-compressed
      Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
      final HashMap<Long,Integer> encode = new HashMap<>();
      // pad the decode table to a power of two so the reader can mask bits directly
      int length = 1 << tableBPV.bitsPerValue;
      data.writeVInt(length);
      for (int i = 0; i < decode.length; i++) {
        data.writeLong(decode[i]);
        encode.put(decode[i], i);
      }
      for (int i = decode.length; i < length; i++) {
        data.writeLong(0);
      }

      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeLong(count);
      data.writeVInt(tableBPV.format.getId());
      data.writeVInt(tableBPV.bitsPerValue);

      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, tableBPV.format, (int)count, tableBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
      for (Number nv : values) {
        writer.add(encode.get(nv == null ? 0 : nv.longValue()));
      }
      writer.finish();
    } else if (gcdBPV != null && (gcdBPV.bitsPerValue + gcdBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
      meta.writeByte(GCD_COMPRESSED);
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeLong(count);
      data.writeLong(minValue);
      data.writeLong(gcd);
      data.writeVInt(gcdBPV.format.getId());
      data.writeVInt(gcdBPV.bitsPerValue);

      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, gcdBPV.format, (int)count, gcdBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
      for (Number nv : values) {
        long value = nv == null ? 0 : nv.longValue();
        writer.add((value - minValue) / gcd);
      }
      writer.finish();
    } else if (doBlock) {
      meta.writeByte(BLOCK_COMPRESSED); // block delta-compressed
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeLong(count);
      data.writeVInt(BLOCK_SIZE);
      final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
      for (Number nv : values) {
        writer.add(nv == null ? 0 : nv.longValue());
      }
      writer.finish();
    } else {
      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeLong(count);
      final long minDelta = deltaBPV.bitsPerValue == 64 ? 0 : minValue;
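      // Note on minDelta above: when 64 bits per value are required anyway, subtracting
      // the minimum cannot shrink the representation, so values are stored as-is.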
      data.writeLong(minDelta);
      data.writeVInt(deltaBPV.format.getId());
      data.writeVInt(deltaBPV.bitsPerValue);

      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, deltaBPV.format, (int)count, deltaBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
      for (Number nv : values) {
        long v = nv == null ? 0 : nv.longValue();
        writer.add(v - minDelta);
      }
      writer.finish();
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
        CodecUtil.writeFooter(meta); // write checksum
      }
      if (data != null) {
        CodecUtil.writeFooter(data);
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
      data = meta = null;
    }
  }

  @Override
  public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    addBinaryField(field, LegacyDocValuesIterables.binaryIterable(field, valuesProducer, maxDoc));
  }

  private void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // write the byte[] data
    meta.writeVInt(field.number);
    meta.writeByte(BYTES);
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    final long startFP = data.getFilePointer();
    boolean missing = false;
    int upto = 0;
    for (BytesRef v : values) {
      final int length;
      if (v == null) {
        length = 0;
        missing = true;
      } else {
        length = v.length;
      }
      if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
        throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " +
                                           MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH + " but got length=" + length +
                                           " v=" + v + "; upto=" + upto + " values=" + values);
      }
      upto++;
      minLength = Math.min(minLength, length);
      maxLength = Math.max(maxLength, length);
      if (v != null) {
        data.writeBytes(v.bytes, v.offset, v.length);
      }
    }
    meta.writeLong(startFP);
    meta.writeLong(data.getFilePointer() - startFP);
    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }
    meta.writeVInt(minLength);
    meta.writeVInt(maxLength);

    // if minLength == maxLength, it's a fixed-length byte[], we are done (the addresses are implicit)
    // otherwise, we need to record the length fields...
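    // Illustrative example, not from the original source: for values of lengths 3, 1 and 4,
    // the writer below records the monotonically increasing end addresses 3, 4, 8; a reader
    // can then recover each value as the slice [previousAddress, address) of the byte[] data.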
    if (minLength != maxLength) {
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeVInt(BLOCK_SIZE);

      final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
      long addr = 0;
      for (BytesRef v : values) {
        if (v != null) {
          addr += v.length;
        }
        writer.add(addr);
      }
      writer.finish();
    }
  }

  private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(FST);
    meta.writeLong(data.getFilePointer());
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    for (BytesRef v : values) {
      builder.add(Util.toIntsRef(v, scratch), ord);
      ord++;
    }
    FST<Long> fst = builder.finish();
    if (fst != null) {
      fst.save(data);
    }
    meta.writeVLong(ord);
  }

  // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
  // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
  void writeMissingBitset(Iterable<?> values) throws IOException {
    long bits = 0;
    int count = 0;
    for (Object v : values) {
      if (count == 64) {
        data.writeLong(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1L << (count & 0x3f);
      }
      count++;
    }
    if (count > 0) {
      data.writeLong(bits);
    }
  }

  @Override
  public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    addSortedField(field,
                   LegacyDocValuesIterables.valuesIterable(valuesProducer.getSorted(field)),
                   LegacyDocValuesIterables.sortedOrdIterable(valuesProducer, field, maxDoc));
  }

  private void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    // write the ordinals as numerics
    addNumericField(field, docToOrd, false);

    // write the values as FST
    writeFST(field, values);
  }

  @Override
  public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {
    final Iterable<Number> docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc);
    final Iterable<Number> values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field);
    meta.writeVInt(field.number);
    if (isSingleValued(docToValueCount)) {
      meta.writeByte(SORTED_NUMERIC_SINGLETON);
      addNumericField(field, singletonView(docToValueCount, values, null), true);
    } else {
      meta.writeByte(SORTED_NUMERIC);

      // write the addresses:
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeVInt(BLOCK_SIZE);
      meta.writeLong(data.getFilePointer());
      final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
      long addr = 0;
      writer.add(addr);
      for (Number v : docToValueCount) {
        addr += v.longValue();
        writer.add(addr);
      }
      writer.finish();
      long valueCount = writer.ord();
      meta.writeLong(valueCount);

      // write the values
      addNumericField(field, values, true);
    }
  }
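  // Illustrative sketch of the ord encoding used below, not from the original source:
  // a document with ords {3, 10, 42} is turned by SortedSetIterator into the vLong
  // deltas 3, 7, 32, forming one per-document byte[] that is written via addBinaryField.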
  // note: this might not be the most efficient... but it's fairly simple
  @Override
  public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    Iterable<BytesRef> values = LegacyDocValuesIterables.valuesIterable(valuesProducer.getSortedSet(field));
    Iterable<Number> docToOrdCount = LegacyDocValuesIterables.sortedSetOrdCountIterable(valuesProducer, field, maxDoc);
    Iterable<Number> ords = LegacyDocValuesIterables.sortedSetOrdsIterable(valuesProducer, field);
    meta.writeVInt(field.number);

    if (isSingleValued(docToOrdCount)) {
      meta.writeByte(SORTED_SET_SINGLETON);
      addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
    } else {
      meta.writeByte(SORTED_SET);
      // write the ordinals as a binary field
      addBinaryField(field, new Iterable<BytesRef>() {
        @Override
        public Iterator<BytesRef> iterator() {
          return new SortedSetIterator(docToOrdCount.iterator(), ords.iterator());
        }
      });

      // write the values as FST
      writeFST(field, values);
    }
  }

  // per-document vint-encoded byte[]
  static class SortedSetIterator implements Iterator<BytesRef> {
    byte[] buffer = new byte[10];
    ByteArrayDataOutput out = new ByteArrayDataOutput();
    BytesRef ref = new BytesRef();

    final Iterator<Number> counts;
    final Iterator<Number> ords;

    SortedSetIterator(Iterator<Number> counts, Iterator<Number> ords) {
      this.counts = counts;
      this.ords = ords;
    }

    @Override
    public boolean hasNext() {
      return counts.hasNext();
    }

    @Override
    public BytesRef next() {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }

      int count = counts.next().intValue();
      int maxSize = count*9; // worst case
      if (maxSize > buffer.length) {
        buffer = ArrayUtil.grow(buffer, maxSize);
      }

      try {
        encodeValues(count);
      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }

      ref.bytes = buffer;
      ref.offset = 0;
      ref.length = out.getPosition();

      return ref;
    }

    // encodes count values to buffer
    private void encodeValues(int count) throws IOException {
      out.reset(buffer);
      long lastOrd = 0;
      for (int i = 0; i < count; i++) {
        long ord = ords.next().longValue();
        out.writeVLong(ord - lastOrd);
        lastOrd = ord;
      }
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
}