// NumericUtils.java example
//
// Explorer
// solrcene-master
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
import org.apache.lucene.document.NumericField; // for javadocs
import org.apache.lucene.search.NumericRangeQuery; // for javadocs
import org.apache.lucene.search.NumericRangeFilter; // for javadocs

// TODO: Remove the commented out methods before release!

/**
 * This is a helper class to generate prefix-encoded representations for numerical values
 * and supplies converters to represent float/double values as sortable integers/longs.
 *
 * <p>To quickly execute range queries in Apache Lucene, a range is divided recursively
 * into multiple intervals for searching: The center of the range is searched only with
 * the lowest possible precision in the trie, while the boundaries are matched
 * more exactly. This reduces the number of terms dramatically.
 *
 * <p>This class generates terms to achieve this: First the numerical integer values need to
 * be converted to bytes. For that integer values (32 bit or 64 bit) are made unsigned
 * and the bits are converted to ASCII chars with 7 bits each. The resulting byte[] is
 * sortable like the original integer value (even using UTF-8 sort order). Each value is also
 * prefixed (in the first char) by the <code>shift</code> value (number of bits removed) used
 * during encoding.
 *
 * <p>To also index floating point numbers, this class supplies two methods to convert them
 * to integer values by changing their bit layout: {@link #doubleToSortableLong},
 * {@link #floatToSortableInt}. You will have no precision loss by
 * converting floating point numbers to integers and back (only that the integer form
 * is not usable). Other data types like dates can easily be converted to longs or ints (e.g.
 * date to long: {@link java.util.Date#getTime}).
 *
 * <p>For easy usage, the trie algorithm is implemented for indexing inside
 * {@link NumericTokenStream} that can index <code>int</code>, <code>long</code>,
 * <code>float</code>, and <code>double</code>. For querying,
 * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part
 * for the same data types.
 *
 * <p>This class can also be used, to generate lexicographically sortable (according to
 * {@link BytesRef#getUTF8SortedAsUTF16Comparator()}) representations of numeric data
 * types for other usages (e.g. sorting).
 *
 * @lucene.internal
 * @since 2.9, API changed non backwards-compliant in 4.0
 */
public final class NumericUtils {

  // Static utility holder: the private constructor prevents instantiation.
  private NumericUtils() {} // no instance!
  
  /**
   * The default precision step used by {@link NumericField}, {@link NumericTokenStream},
   * {@link NumericRangeQuery}, and {@link NumericRangeFilter} as default
   */
  public static final int PRECISION_STEP_DEFAULT = 4;
  
  /**
   * Longs are stored at lower precision by shifting off lower bits. The shift count is
   * stored as <code>SHIFT_START_LONG+shift</code> in the first byte.
   * With the valid shift range 0..63 the first byte therefore lies in 0x20..0x5f.
   */
  public static final byte SHIFT_START_LONG = 0x20;

  /**
   * The maximum term length (used for <code>byte[]</code> buffer size)
   * for encoding <code>long</code> values: one byte for the shift prefix plus at most
   * <code>63/7 + 1</code> bytes carrying 7 bits each (reached at shift 0).
   * @see #longToPrefixCoded(long,int,BytesRef)
   */
  public static final int BUF_SIZE_LONG = 63/7 + 2;

  /**
   * Integers are stored at lower precision by shifting off lower bits. The shift count is
   * stored as <code>SHIFT_START_INT+shift</code> in the first byte.
   * With the valid shift range 0..31 the first byte therefore lies in 0x60..0x7f.
   */
  public static final byte SHIFT_START_INT  = 0x60;

  /**
   * The maximum term length (used for <code>byte[]</code> buffer size)
   * for encoding <code>int</code> values: one byte for the shift prefix plus at most
   * <code>31/7 + 1</code> bytes carrying 7 bits each (reached at shift 0).
   * @see #intToPrefixCoded(int,int,BytesRef)
   */
  public static final int BUF_SIZE_INT = 31/7 + 2;

  /**
   * Encodes a {@code long} as prefix coded bytes after dropping the lowest <code>shift</code> bits.
   * The first byte holds <code>SHIFT_START_LONG+shift</code>; the following bytes carry the
   * sign-flipped, shifted value with 7 bits per byte, most significant group first.
   * This method is used by {@link NumericTokenStream}.
   * After encoding, {@code bytes.offset} will always be 0.
   * @param val the numeric value
   * @param shift how many bits to strip from the right
   * @param bytes will contain the encoded value
   * @return the hash code for indexing (TermsHash)
   * @throws IllegalArgumentException if <code>shift</code> is outside 0..63
   */
  public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) {
    if (shift < 0 || shift > 63) {
      throw new IllegalArgumentException("Illegal shift value, must be 0..63");
    }
    // number of 7-bit groups needed for the bits that survive the shift
    final int payloadLength = (63 - shift) / 7 + 1;
    bytes.offset = 0;
    bytes.length = payloadLength + 1;
    if (bytes.length > bytes.bytes.length) {
      bytes.grow(NumericUtils.BUF_SIZE_LONG);
    }
    final int header = SHIFT_START_LONG + shift;
    bytes.bytes[0] = (byte) header;
    // flip the sign bit so that the unsigned byte order matches the signed numeric order
    long remaining = (val ^ 0x8000000000000000L) >>> shift;
    // fill from the last byte backwards; only 7 bits per byte are used so that
    // the terms stay compatible with UTF-8 encoding
    for (int pos = payloadLength; pos >= 1; pos--) {
      bytes.bytes[pos] = (byte) (remaining & 0x7f);
      remaining >>>= 7;
    }
    // the hash starts from the header value and folds in every payload byte
    int hash = header;
    for (int pos = 1; pos < bytes.length; pos++) {
      hash = hash * 31 + bytes.bytes[pos];
    }
    return hash;
  }

  /*
   * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
   * This is method is used by {@link LongRangeBuilder}.
   * @param val the numeric value
   * @param shift how many bits to strip from the right
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static String longToPrefixCoded(final long val, final int shift) {
    final BytesRef buffer = new BytesRef(BUF_SIZE_LONG);
    longToPrefixCoded(val, shift, buffer);
    return buffer.utf8ToString();
  }*/

  /*
   * This is a convenience method, that returns prefix coded bits of a long without
   * reducing the precision. It can be used to store the full precision value as a
   * stored field in index.
   * <p>To decode, use {@link #prefixCodedToLong}.
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static String longToPrefixCoded(final long val) {
    return longToPrefixCoded(val, 0);
  }*/
  
  /**
   * Encodes an {@code int} as prefix coded bytes after dropping the lowest <code>shift</code> bits.
   * The first byte holds <code>SHIFT_START_INT+shift</code>; the following bytes carry the
   * sign-flipped, shifted value with 7 bits per byte, most significant group first.
   * This method is used by {@link NumericTokenStream}.
   * After encoding, {@code bytes.offset} will always be 0.
   * @param val the numeric value
   * @param shift how many bits to strip from the right
   * @param bytes will contain the encoded value
   * @return the hash code for indexing (TermsHash)
   * @throws IllegalArgumentException if <code>shift</code> is outside 0..31
   */
  public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) {
    if (shift < 0 || shift > 31) {
      throw new IllegalArgumentException("Illegal shift value, must be 0..31");
    }
    // number of 7-bit groups needed for the bits that survive the shift
    final int payloadLength = (31 - shift) / 7 + 1;
    bytes.offset = 0;
    bytes.length = payloadLength + 1;
    if (bytes.length > bytes.bytes.length) {
      bytes.grow(NumericUtils.BUF_SIZE_INT);
    }
    final int header = SHIFT_START_INT + shift;
    bytes.bytes[0] = (byte) header;
    // flip the sign bit so that the unsigned byte order matches the signed numeric order
    int remaining = (val ^ 0x80000000) >>> shift;
    // fill from the last byte backwards; only 7 bits per byte are used so that
    // the terms stay compatible with UTF-8 encoding
    for (int pos = payloadLength; pos >= 1; pos--) {
      bytes.bytes[pos] = (byte) (remaining & 0x7f);
      remaining >>>= 7;
    }
    // the hash starts from the header value and folds in every payload byte
    int hash = header;
    for (int pos = 1; pos < bytes.length; pos++) {
      hash = hash * 31 + bytes.bytes[pos];
    }
    return hash;
  }

  /*
   * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
   * This is method is used by {@link IntRangeBuilder}.
   * @param val the numeric value
   * @param shift how many bits to strip from the right
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static String intToPrefixCoded(final int val, final int shift) {
    final BytesRef buffer = new BytesRef(BUF_SIZE_INT);
    intToPrefixCoded(val, shift, buffer);
    return buffer.utf8ToString();
  }*/

  /*
   * This is a convenience method, that returns prefix coded bits of an int without
   * reducing the precision. It can be used to store the full precision value as a
   * stored field in index.
   * <p>To decode, use {@link #prefixCodedToInt}.
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static String intToPrefixCoded(final int val) {
    return intToPrefixCoded(val, 0);
  }*/

  /*
   * Returns a long from prefixCoded characters.
   * Rightmost bits will be zero for lower precision codes.
   * This method can be used to decode e.g. a stored field.
   * @throws NumberFormatException if the supplied string is
   * not correctly prefix encoded.
   * @see #longToPrefixCoded(long)
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static long prefixCodedToLong(final String prefixCoded) {
    return prefixCodedToLong(new BytesRef(prefixCoded));
  }*/

  /**
   * Reads the shift value from the first byte of a prefix encoded {@code long}.
   * @param val the prefix encoded bytes; the byte at {@code val.offset} holds the shift marker
   * @return the shift, in the range 0..63
   * @throws NumberFormatException if the supplied {@link BytesRef} is
   * not correctly prefix encoded, i.e. the decoded shift is outside 0..63.
   */
  public static int getPrefixCodedLongShift(final BytesRef val) {
    final int decoded = val.bytes[val.offset] - SHIFT_START_LONG;
    if (decoded >= 0 && decoded <= 63) {
      return decoded;
    }
    throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)");
  }

  /**
   * Reads the shift value from the first byte of a prefix encoded {@code int}.
   * @param val the prefix encoded bytes; the byte at {@code val.offset} holds the shift marker
   * @return the shift, in the range 0..31
   * @throws NumberFormatException if the supplied {@link BytesRef} is
   * not correctly prefix encoded, i.e. the decoded shift is outside 0..31.
   */
  public static int getPrefixCodedIntShift(final BytesRef val) {
    final int decoded = val.bytes[val.offset] - SHIFT_START_INT;
    if (decoded >= 0 && decoded <= 31) {
      return decoded;
    }
    throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)");
  }

  /**
   * Decodes a long from prefixCoded bytes.
   * Rightmost bits will be zero for lower precision codes.
   * This method can be used to decode a term's value.
   * @param val the prefix encoded bytes, starting at {@code val.offset}
   * @return the decoded value
   * @throws NumberFormatException if the supplied {@link BytesRef} is
   * not correctly prefix encoded (a payload byte has its high bit set, or the
   * shift stored in the first byte is invalid).
   * @see #longToPrefixCoded(long,int,BytesRef)
   */
  public static long prefixCodedToLong(final BytesRef val) {
    final int end = val.offset + val.length;
    long accumulated = 0L;
    // the byte at val.offset is the shift marker; the payload starts right after it
    int pos = val.offset + 1;
    while (pos < end) {
      final byte current = val.bytes[pos];
      if (current < 0) {
        // only 7 bits per byte are ever written by the encoder
        throw new NumberFormatException(
          "Invalid prefixCoded numerical value representation (byte "+
          Integer.toHexString(current&0xff)+" at position "+(pos-val.offset)+" is invalid)"
        );
      }
      accumulated = (accumulated << 7) | current;
      pos++;
    }
    // restore the bits removed by the shift (as zeros) and undo the sign-bit flip
    final int shift = getPrefixCodedLongShift(val);
    return (accumulated << shift) ^ 0x8000000000000000L;
  }

  /*
   * Returns an int from prefixCoded characters.
   * Rightmost bits will be zero for lower precision codes.
   * This method can be used to decode a term's value.
   * @throws NumberFormatException if the supplied string is
   * not correctly prefix encoded.
   * @see #intToPrefixCoded(int)
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static int prefixCodedToInt(final String prefixCoded) {
    return prefixCodedToInt(new BytesRef(prefixCoded));
  }*/

  /**
   * Decodes an int from prefixCoded bytes.
   * Rightmost bits will be zero for lower precision codes.
   * This method can be used to decode a term's value.
   * @param val the prefix encoded bytes, starting at {@code val.offset}
   * @return the decoded value
   * @throws NumberFormatException if the supplied {@link BytesRef} is
   * not correctly prefix encoded (a payload byte has its high bit set, or the
   * shift stored in the first byte is invalid).
   * @see #intToPrefixCoded(int,int,BytesRef)
   */
  public static int prefixCodedToInt(final BytesRef val) {
    final int end = val.offset + val.length;
    int accumulated = 0;
    // the byte at val.offset is the shift marker; the payload starts right after it
    int pos = val.offset + 1;
    while (pos < end) {
      final byte current = val.bytes[pos];
      if (current < 0) {
        // only 7 bits per byte are ever written by the encoder
        throw new NumberFormatException(
          "Invalid prefixCoded numerical value representation (byte "+
          Integer.toHexString(current&0xff)+" at position "+(pos-val.offset)+" is invalid)"
        );
      }
      accumulated = (accumulated << 7) | current;
      pos++;
    }
    // restore the bits removed by the shift (as zeros) and undo the sign-bit flip
    final int shift = getPrefixCodedIntShift(val);
    return (accumulated << shift) ^ 0x80000000;
  }

  /**
   * Converts a <code>double</code> value to a sortable signed <code>long</code>.
   * The value is converted by getting its IEEE 754 floating-point "double format"
   * bit layout and, for values with the sign bit set, inverting all bits except the sign bit,
   * so that the results compare as <code>long</code>s in the same order as the doubles.
   * By this the precision is not reduced, but the value can easily be used as a long.
   * <p>All NaN bit patterns are collapsed to the canonical NaN (see
   * {@link Double#doubleToLongBits}), so every NaN maps to the same sortable value,
   * which is greater than the one of {@link Double#POSITIVE_INFINITY}.
   * @param val the value to convert
   * @return the sortable <code>long</code> representation
   * @see #sortableLongToDouble
   */
  public static long doubleToSortableLong(double val) {
    // doubleToLongBits (not the raw variant) canonicalizes NaN: a NaN with the sign bit set
    // would otherwise be treated as negative and end up below negative infinity.
    final long bits = Double.doubleToLongBits(val);
    return bits < 0 ? bits ^ 0x7fffffffffffffffL : bits;
  }

  /*
   * Convenience method: this just returns:
   *   longToPrefixCoded(doubleToSortableLong(val))
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static String doubleToPrefixCoded(double val) {
    return longToPrefixCoded(doubleToSortableLong(val));
  }*/

  /**
   * Converts a sortable <code>long</code> back to a <code>double</code>.
   * Negative inputs have all bits except the sign bit inverted before the bit pattern
   * is reinterpreted as a double; non-negative inputs are reinterpreted unchanged.
   * @param val the sortable <code>long</code> representation
   * @return the decoded <code>double</code>
   * @see #doubleToSortableLong
   */
  public static double sortableLongToDouble(long val) {
    final long rawBits = (val < 0) ? (val ^ 0x7fffffffffffffffL) : val;
    return Double.longBitsToDouble(rawBits);
  }

  /*
   * Convenience method: this just returns:
   *    sortableLongToDouble(prefixCodedToLong(val))
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static double prefixCodedToDouble(String val) {
    return sortableLongToDouble(prefixCodedToLong(val));
  }*/

  /**
   * Converts a <code>float</code> value to a sortable signed <code>int</code>.
   * The value is converted by getting its IEEE 754 floating-point "float format"
   * bit layout and, for values with the sign bit set, inverting all bits except the sign bit,
   * so that the results compare as <code>int</code>s in the same order as the floats.
   * By this the precision is not reduced, but the value can easily be used as an int.
   * <p>All NaN bit patterns are collapsed to the canonical NaN (see
   * {@link Float#floatToIntBits}), so every NaN maps to the same sortable value,
   * which is greater than the one of {@link Float#POSITIVE_INFINITY}.
   * @param val the value to convert
   * @return the sortable <code>int</code> representation
   * @see #sortableIntToFloat
   */
  public static int floatToSortableInt(float val) {
    // floatToIntBits (not the raw variant) canonicalizes NaN: a NaN with the sign bit set
    // would otherwise be treated as negative and end up below negative infinity.
    final int bits = Float.floatToIntBits(val);
    return bits < 0 ? bits ^ 0x7fffffff : bits;
  }

  /*
   * Convenience method: this just returns:
   *   intToPrefixCoded(floatToSortableInt(val))
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static String floatToPrefixCoded(float val) {
    return intToPrefixCoded(floatToSortableInt(val));
  }*/

  /**
   * Converts a sortable <code>int</code> back to a <code>float</code>.
   * Negative inputs have all bits except the sign bit inverted before the bit pattern
   * is reinterpreted as a float; non-negative inputs are reinterpreted unchanged.
   * @param val the sortable <code>int</code> representation
   * @return the decoded <code>float</code>
   * @see #floatToSortableInt
   */
  public static float sortableIntToFloat(int val) {
    final int rawBits = (val < 0) ? (val ^ 0x7fffffff) : val;
    return Float.intBitsToFloat(rawBits);
  }

  /*
   * Convenience method: this just returns:
   *    sortableIntToFloat(prefixCodedToInt(val))
   * @deprecated This method is no longer needed!
   *
  @Deprecated
  public static float prefixCodedToFloat(String val) {
    return sortableIntToFloat(prefixCodedToInt(val));
  }*/

  /**
   * Splits a long range recursively into sub-ranges of decreasing precision.
   * Each resulting sub-range is reported to the supplied builder; you may implement
   * a builder that adds clauses to a
   * {@link org.apache.lucene.search.BooleanQuery} for each call to its
   * {@link LongRangeBuilder#addRange(BytesRef,BytesRef)}
   * method.
   * <p>This method is used by {@link NumericRangeQuery}.
   * @param builder receives the sub-ranges
   * @param precisionStep number of bits dropped per precision level, must be &gt;=1
   * @param minBound inclusive lower bound
   * @param maxBound inclusive upper bound
   */
  public static void splitLongRange(
    final LongRangeBuilder builder,
    final int precisionStep,
    final long minBound,
    final long maxBound
  ) {
    final int valueBits = 64;
    splitRange(builder, valueBits, precisionStep, minBound, maxBound);
  }
  
  /**
   * Splits an int range recursively into sub-ranges of decreasing precision.
   * Each resulting sub-range is reported to the supplied builder; you may implement
   * a builder that adds clauses to a
   * {@link org.apache.lucene.search.BooleanQuery} for each call to its
   * {@link IntRangeBuilder#addRange(BytesRef,BytesRef)}
   * method.
   * <p>This method is used by {@link NumericRangeQuery}.
   * @param builder receives the sub-ranges
   * @param precisionStep number of bits dropped per precision level, must be &gt;=1
   * @param minBound inclusive lower bound
   * @param maxBound inclusive upper bound
   */
  public static void splitIntRange(
    final IntRangeBuilder builder,
    final int precisionStep,
    final int minBound,
    final int maxBound
  ) {
    // the shared splitter works on longs; the int bounds are widened for it
    final long lower = minBound;
    final long upper = maxBound;
    final int valueBits = 32;
    splitRange(builder, valueBits, precisionStep, lower, upper);
  }
  
  /**
   * This helper does the splitting for both 32 and 64 bit.
   * Starting at full precision, each pass emits the partial ranges at the lower and upper
   * edge that do not align with the next precision level, then continues with the aligned
   * inner range at the next level. The remaining range is emitted as a whole once the next
   * level would not be available.
   * @param builder a {@link LongRangeBuilder} (valSize 64) or {@link IntRangeBuilder} (valSize 32)
   * @param valSize number of bits of the value type
   * @param precisionStep number of bits dropped per precision level
   * @param minBound inclusive lower bound
   * @param maxBound inclusive upper bound
   * @throws IllegalArgumentException if <code>precisionStep</code> is smaller than 1
   */
  private static void splitRange(
    final Object builder, final int valSize,
    final int precisionStep, long minBound, long maxBound
  ) {
    if (precisionStep < 1) {
      throw new IllegalArgumentException("precisionStep must be >=1");
    }
    if (minBound > maxBound) {
      return;
    }
    int shift = 0;
    while (true) {
      final int nextShift = shift + precisionStep;
      // diff is one unit of the next precision level; mask selects the bits of the current level
      final long diff = 1L << nextShift;
      final long mask = ((1L << precisionStep) - 1L) << shift;
      final long clearMask = ~mask;

      final boolean hasLower = (minBound & mask) != 0L;
      final boolean hasUpper = (maxBound & mask) != mask;

      // bounds of the inner range, aligned to the next precision level
      long nextMinBound = minBound;
      if (hasLower) {
        nextMinBound += diff;
      }
      nextMinBound &= clearMask;
      long nextMaxBound = maxBound;
      if (hasUpper) {
        nextMaxBound -= diff;
      }
      nextMaxBound &= clearMask;

      // the arithmetic above may wrap around at the ends of the value range
      final boolean lowerWrapped = nextMinBound < minBound;
      final boolean upperWrapped = nextMaxBound > maxBound;

      final boolean lastLevel =
        nextShift >= valSize || nextMinBound > nextMaxBound || lowerWrapped || upperWrapped;
      if (lastLevel) {
        // We are in the lowest precision or the next precision is not available:
        // emit what is left and stop splitting.
        addRange(builder, valSize, minBound, maxBound, shift);
        return;
      }

      if (hasLower) {
        addRange(builder, valSize, minBound, minBound | mask, shift);
      }
      if (hasUpper) {
        addRange(builder, valSize, maxBound & clearMask, maxBound, shift);
      }

      // continue with the inner range at the next precision
      minBound = nextMinBound;
      maxBound = nextMaxBound;
      shift = nextShift;
    }
  }
  
  /**
   * Helper that delegates to the correct range builder.
   * @param builder a {@link LongRangeBuilder} when valSize is 64, an {@link IntRangeBuilder} when it is 32
   * @param valSize number of bits of the value type
   * @param minBound inclusive lower bound
   * @param maxBound inclusive upper bound; its lowest <code>shift</code> bits are set before delegating
   * @param shift the precision level of the range
   * @throws IllegalArgumentException if <code>valSize</code> is neither 32 nor 64
   */
  private static void addRange(
    final Object builder, final int valSize,
    long minBound, long maxBound,
    final int shift
  ) {
    // For the max bound set all lower bits (that were shifted away):
    // this is important for testing or other usages of the split range
    // (e.g. to reconstruct the full range). The prefix encoding will remove
    // the bits anyway, so they do not hurt!
    final long shiftedAwayBits = (1L << shift) - 1L;
    final long inclusiveMax = maxBound | shiftedAwayBits;
    // delegate to correct range builder
    if (valSize == 64) {
      ((LongRangeBuilder) builder).addRange(minBound, inclusiveMax, shift);
    } else if (valSize == 32) {
      ((IntRangeBuilder) builder).addRange((int) minBound, (int) inclusiveMax, shift);
    } else {
      // Should not happen!
      throw new IllegalArgumentException("valSize must be 32 or 64.");
    }
  }

  /**
   * Callback for {@link #splitLongRange}.
   * You need to overwrite only one of the methods: the raw variant encodes its bounds
   * and forwards to the prefix coded variant, which throws unless overwritten.
   * @lucene.internal
   * @since 2.9, API changed non backwards-compliant in 4.0
   */
  public static abstract class LongRangeBuilder {

    /**
     * Overwrite this method, if you like to receive the already prefix encoded range bounds.
     * You can directly build classical (inclusive) range queries from them.
     * @throws UnsupportedOperationException in this default implementation
     */
    public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
      throw new UnsupportedOperationException();
    }

    /**
     * Overwrite this method, if you like to receive the raw long range bounds.
     * You can use this for e.g. debugging purposes (print out range bounds).
     * The default implementation prefix encodes both bounds with the given shift and
     * passes them to {@link #addRange(BytesRef,BytesRef)}.
     */
    public void addRange(final long min, final long max, final int shift) {
      final BytesRef lowerTerm = new BytesRef(BUF_SIZE_LONG);
      longToPrefixCoded(min, shift, lowerTerm);
      final BytesRef upperTerm = new BytesRef(BUF_SIZE_LONG);
      longToPrefixCoded(max, shift, upperTerm);
      addRange(lowerTerm, upperTerm);
    }

  }
  
  /**
   * Callback for {@link #splitIntRange}.
   * You need to overwrite only one of the methods: the raw variant encodes its bounds
   * and forwards to the prefix coded variant, which throws unless overwritten.
   * @lucene.internal
   * @since 2.9, API changed non backwards-compliant in 4.0
   */
  public static abstract class IntRangeBuilder {

    /**
     * Overwrite this method, if you like to receive the already prefix encoded range bounds.
     * You can directly build classical range (inclusive) queries from them.
     * @throws UnsupportedOperationException in this default implementation
     */
    public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) {
      throw new UnsupportedOperationException();
    }

    /**
     * Overwrite this method, if you like to receive the raw int range bounds.
     * You can use this for e.g. debugging purposes (print out range bounds).
     * The default implementation prefix encodes both bounds with the given shift and
     * passes them to {@link #addRange(BytesRef,BytesRef)}.
     */
    public void addRange(final int min, final int max, final int shift) {
      final BytesRef lowerTerm = new BytesRef(BUF_SIZE_INT);
      intToPrefixCoded(min, shift, lowerTerm);
      final BytesRef upperTerm = new BytesRef(BUF_SIZE_INT);
      intToPrefixCoded(max, shift, upperTerm);
      addRange(lowerTerm, upperTerm);
    }

  }
  
}