package org.apache.lucene.codecs.lucene45;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable; // javadocs
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

/** writer for {@link Lucene45DocValuesFormat} */
public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable {

  static final int BLOCK_SIZE = 16384;
  static final int ADDRESS_INTERVAL = 16;
  static final Number MISSING_ORD = Long.valueOf(-1);

  /** Compressed using packed blocks of ints. */
  public static final int DELTA_COMPRESSED = 0;
  /** Compressed by computing the GCD. */
  public static final int GCD_COMPRESSED = 1;
  /** Compressed by giving IDs to unique values. */
  public static final int TABLE_COMPRESSED = 2;

  /** Uncompressed binary, written directly (fixed length). */
  public static final int BINARY_FIXED_UNCOMPRESSED = 0;
  /** Uncompressed binary, written directly (variable length). */
  public static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
  /** Compressed binary with shared prefixes */
  public static final int BINARY_PREFIX_COMPRESSED = 2;

  /** Standard storage for sorted set values with 1 level of indirection:
   *  docId -> address -> ord. */
  public static final int SORTED_SET_WITH_ADDRESSES = 0;
  /** Single-valued sorted set values, encoded as sorted values, so no level
   *  of indirection: docId -> ord. */
  public static final int SORTED_SET_SINGLE_VALUED_SORTED = 1;
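  // Numeric format selection (see addNumericField below): TABLE_COMPRESSED is used when there
  // are at most 256 unique values and a table ord costs fewer bits than a delta from the minimum;
  // for example, the three values 17, 42 and 9000001 need only 2 bits per table ord versus
  // 24 bits per delta. GCD_COMPRESSED is used when all deltas share a common divisor greater
  // than 1; otherwise DELTA_COMPRESSED is the fallback.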
  final IndexOutput data, meta;
  final int maxDoc;

  /** expert: Creates a new writer */
  public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeHeader(data, dataCodec, Lucene45DocValuesFormat.VERSION_CURRENT);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeHeader(meta, metaCodec, Lucene45DocValuesFormat.VERSION_CURRENT);
      maxDoc = state.segmentInfo.getDocCount();
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
    addNumericField(field, values, true);
  }

  void addNumericField(FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws IOException {
    long count = 0;
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    long gcd = 0;
    boolean missing = false;
    // TODO: more efficient?
    HashSet<Long> uniqueValues = null;
    if (optimizeStorage) {
      uniqueValues = new HashSet<>();

      for (Number nv : values) {
        final long v;
        if (nv == null) {
          v = 0;
          missing = true;
        } else {
          v = nv.longValue();
        }

        if (gcd != 1) {
          if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
            // in that case v - minValue might overflow and make the GCD computation return
            // wrong results. Since these extreme values are unlikely, we just discard
            // GCD computation for them
            gcd = 1;
          } else if (count != 0) { // minValue needs to be set first
            gcd = MathUtil.gcd(gcd, v - minValue);
          }
        }

        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);

        if (uniqueValues != null) {
          if (uniqueValues.add(v)) {
            if (uniqueValues.size() > 256) {
              uniqueValues = null;
            }
          }
        }

        ++count;
      }
    } else {
      for (@SuppressWarnings("unused") Number nv : values) {
        ++count;
      }
    }

    final long delta = maxValue - minValue;

    final int format;
    if (uniqueValues != null
        && (delta < 0L || PackedInts.bitsRequired(uniqueValues.size() - 1) < PackedInts.bitsRequired(delta))
        && count <= Integer.MAX_VALUE) {
      format = TABLE_COMPRESSED;
    } else if (gcd != 0 && gcd != 1) {
      format = GCD_COMPRESSED;
    } else {
      format = DELTA_COMPRESSED;
    }
    meta.writeVInt(field.number);
    meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
    meta.writeVInt(format);
    if (missing) {
      meta.writeLong(data.getFilePointer());
      writeMissingBitset(values);
    } else {
      meta.writeLong(-1L);
    }
    meta.writeVInt(PackedInts.VERSION_CURRENT);
    meta.writeLong(data.getFilePointer());
    meta.writeVLong(count);
    meta.writeVInt(BLOCK_SIZE);

    switch (format) {
      case GCD_COMPRESSED:
        meta.writeLong(minValue);
        meta.writeLong(gcd);
        final BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE);
        for (Number nv : values) {
          long value = nv == null ? 0 : nv.longValue();
          quotientWriter.add((value - minValue) / gcd);
        }
        quotientWriter.finish();
        break;
      case DELTA_COMPRESSED:
        final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
        for (Number nv : values) {
          writer.add(nv == null ? 0 : nv.longValue());
        }
        writer.finish();
        break;
      case TABLE_COMPRESSED:
        final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
        final HashMap<Long,Integer> encode = new HashMap<>();
        meta.writeVInt(decode.length);
        for (int i = 0; i < decode.length; i++) {
          meta.writeLong(decode[i]);
          encode.put(decode[i], i);
        }
        final int bitsRequired = PackedInts.bitsRequired(uniqueValues.size() - 1);
        final PackedInts.Writer ordsWriter = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, (int) count, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE);
        for (Number nv : values) {
          ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue()));
        }
        ordsWriter.finish();
        break;
      default:
        throw new AssertionError();
    }
  }
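  // Worked example of GCD_COMPRESSED (illustrative values): for the stream 100, 130, 145 the
  // running minimum settles at 100 and gcd(130 - 100, 145 - 100) = gcd(30, 45) = 15, so the
  // quotients (value - minValue) / gcd written above are 0, 2, 3; a reader multiplies each
  // quotient back by 15 and adds 100 to recover the original values.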
  // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
  // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
  void writeMissingBitset(Iterable<?> values) throws IOException {
    byte bits = 0;
    int count = 0;
    for (Object v : values) {
      if (count == 8) {
        data.writeByte(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1 << (count & 7);
      }
      count++;
    }
    if (count > 0) {
      data.writeByte(bits);
    }
  }

  @Override
  public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    // write the byte[] data
    meta.writeVInt(field.number);
    meta.writeByte(Lucene45DocValuesFormat.BINARY);
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    final long startFP = data.getFilePointer();
    long count = 0;
    boolean missing = false;
    for (BytesRef v : values) {
      final int length;
      if (v == null) {
        length = 0;
        missing = true;
      } else {
        length = v.length;
      }
      minLength = Math.min(minLength, length);
      maxLength = Math.max(maxLength, length);
      if (v != null) {
        data.writeBytes(v.bytes, v.offset, v.length);
      }
      count++;
    }
    meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
    if (missing) {
      meta.writeLong(data.getFilePointer());
      writeMissingBitset(values);
    } else {
      meta.writeLong(-1L);
    }
    meta.writeVInt(minLength);
    meta.writeVInt(maxLength);
    meta.writeVLong(count);
    meta.writeLong(startFP);

    // if minLength == maxLength, it's a fixed-length byte[]: we are done (the addresses are implicit)
    // otherwise, we need to record the length fields...
    if (minLength != maxLength) {
      meta.writeLong(data.getFilePointer());
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeVInt(BLOCK_SIZE);

      final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
      long addr = 0;
      for (BytesRef v : values) {
        if (v != null) {
          addr += v.length;
        }
        writer.add(addr);
      }
      writer.finish();
    }
  }
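  // Worked example of the variable-length address stream above (illustrative lengths): values of
  // length 3, 5 and 2 produce the monotonic end offsets 3, 8, 10, so document i's bytes are the
  // slice [address(i-1), address(i)) of the data written at startFP, with address(-1) = 0. A
  // missing (null) value adds nothing to addr, yielding an empty slice for that document.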
  /** expert: writes a value dictionary for a sorted/sortedset field */
  protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
    // first check if it's a "fixed-length" terms dict
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    for (BytesRef v : values) {
      minLength = Math.min(minLength, v.length);
      maxLength = Math.max(maxLength, v.length);
    }
    if (minLength == maxLength) {
      // no index needed: direct addressing by multiplication
      addBinaryField(field, values);
    } else {
      // header
      meta.writeVInt(field.number);
      meta.writeByte(Lucene45DocValuesFormat.BINARY);
      meta.writeVInt(BINARY_PREFIX_COMPRESSED);
      meta.writeLong(-1L);
      // now write the bytes: sharing prefixes within a block
      final long startFP = data.getFilePointer();
      // currently, we have to store the delta from expected for every 1/nth term
      // we could avoid this, but it's not much and less overall RAM than the previous approach!
      RAMOutputStream addressBuffer = new RAMOutputStream();
      MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
      BytesRef lastTerm = new BytesRef();
      long count = 0;
      for (BytesRef v : values) {
        if (count % ADDRESS_INTERVAL == 0) {
          termAddresses.add(data.getFilePointer() - startFP);
          // force the first term in a block to be abs-encoded
          lastTerm.length = 0;
        }

        // prefix-code
        int sharedPrefix = StringHelper.bytesDifference(lastTerm, v);
        data.writeVInt(sharedPrefix);
        data.writeVInt(v.length - sharedPrefix);
        data.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
        lastTerm.copyBytes(v);
        count++;
      }
      final long indexStartFP = data.getFilePointer();
      // write addresses of indexed terms
      termAddresses.finish();
      addressBuffer.writeTo(data);
      addressBuffer = null;
      termAddresses = null;
      meta.writeVInt(minLength);
      meta.writeVInt(maxLength);
      meta.writeVLong(count);
      meta.writeLong(startFP);
      meta.writeVInt(ADDRESS_INTERVAL);
      meta.writeLong(indexStartFP);
      meta.writeVInt(PackedInts.VERSION_CURRENT);
      meta.writeVInt(BLOCK_SIZE);
    }
  }
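  // Worked example of BINARY_PREFIX_COMPRESSED (illustrative terms): with "absolute" followed by
  // "absorb", bytesDifference reports a shared prefix of 4 bytes ("abso"), so the second term is
  // written as vInt(4), vInt(2), "rb". Every ADDRESS_INTERVAL'th term resets lastTerm and is
  // therefore written in full, and its start offset is recorded in the termAddresses index so a
  // reader can seek to the enclosing block and re-apply the prefix coding from there.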
  @Override
  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(Lucene45DocValuesFormat.SORTED);
    addTermsDict(field, values);
    addNumericField(field, docToOrd, false);
  }

  private static boolean isSingleValued(Iterable<Number> docToOrdCount) {
    for (Number ordCount : docToOrdCount) {
      if (ordCount.longValue() > 1) {
        return false;
      }
    }
    return true;
  }

  @Override
  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(Lucene45DocValuesFormat.SORTED_SET);

    if (isSingleValued(docToOrdCount)) {
      meta.writeVInt(SORTED_SET_SINGLE_VALUED_SORTED);
      // The field is single-valued, we can encode it as SORTED
      addSortedField(field, values, new Iterable<Number>() {
        @Override
        public Iterator<Number> iterator() {
          final Iterator<Number> docToOrdCountIt = docToOrdCount.iterator();
          final Iterator<Number> ordsIt = ords.iterator();
          return new Iterator<Number>() {

            @Override
            public boolean hasNext() {
              assert ordsIt.hasNext() ? docToOrdCountIt.hasNext() : true;
              return docToOrdCountIt.hasNext();
            }

            @Override
            public Number next() {
              final Number ordCount = docToOrdCountIt.next();
              if (ordCount.longValue() == 0) {
                return MISSING_ORD;
              } else {
                assert ordCount.longValue() == 1;
                return ordsIt.next();
              }
            }

            @Override
            public void remove() {
              throw new UnsupportedOperationException();
            }

          };
        }
      });
      return;
    }

    meta.writeVInt(SORTED_SET_WITH_ADDRESSES);

    // write the ord -> byte[] as a binary field
    addTermsDict(field, values);

    // write the stream of ords as a numeric field
    // NOTE: we could return an iterator that delta-encodes these within a doc
    addNumericField(field, ords, false);

    // write the doc -> ord count as an absolute index to the stream
    meta.writeVInt(field.number);
    meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
    meta.writeVInt(DELTA_COMPRESSED);
    meta.writeLong(-1L);
    meta.writeVInt(PackedInts.VERSION_CURRENT);
    meta.writeLong(data.getFilePointer());
    meta.writeVLong(maxDoc);
    meta.writeVInt(BLOCK_SIZE);

    final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
    long addr = 0;
    for (Number v : docToOrdCount) {
      addr += v.longValue();
      writer.add(addr);
    }
    writer.finish();
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
    }
  }
}