// BCDUtils.java example
//
// Explorer
// solrcene-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util;


/**
 * Static helpers that convert decimal integer text (<code>[+-]?digit*</code>)
 * to and from compact character encodings in which each <code>char</code>
 * holds several decimal digits.
 *
 * <p>Three encodings are implemented:</p>
 * <ul>
 *   <li><b>base100</b>: two decimal digits per char (values 0..99), with an
 *       optional leading {@link #NEG_CHAR} for negative values.  Zero is the
 *       empty sequence.</li>
 *   <li><b>base100 sortable</b>: a leading char combining sign and exponent,
 *       followed by two decimal digits per char.  Digit chars of negative
 *       numbers are stored as <code>99 - value</code>.</li>
 *   <li><b>base10k sortable</b>: same layout with four decimal digits per char
 *       (negative digit chars are stored as <code>9999 - value</code>).</li>
 * </ul>
 *
 * <p>In the sortable encodings the first char is
 * <code>ZERO_EXPONENT + n</code> for non-negative numbers and
 * <code>ZERO_EXPONENT - n</code> for negative numbers, where <code>n</code> is
 * the number of digit chars that follow.  Zero is therefore the single char
 * <code>ZERO_EXPONENT</code>.  Trailing zeros are deliberately <em>not</em>
 * stripped, so zero is the only value encoded in one char.</p>
 *
 * <p>No input validation is done by any method.  The sign/exponent char is
 * written without a range check: for a negative number needing more than
 * <code>ZERO_EXPONENT</code> (97) digit chars the subtraction goes below zero
 * and wraps when narrowed to <code>char</code>.</p>
 *
 * @version $Id: BCDUtils.java 555343 2007-07-11 17:46:25Z hossman $
 */
public class BCDUtils {
  // idiv is expensive...
  // use fixed point math to multiply by 1/10
  // http://www.cs.uiowa.edu/~jones/bcd/divide.html
  private static int div10(int a) { return (a * 0xcccd) >>> 19; }
  private static int mul10(int a) { return a * 10; }

  /** Sign marker for the (non sortable) base100 form; must not clash with a digit char (0..99). */
  private static final char NEG_CHAR = (char)126;

  /**
   * The zero exponent.  The sortable encodings combine sign and exponent into
   * the first char: the number is negative if that char is below this value.
   */
  private static final int ZERO_EXPONENT = 'a';  // 97

  /**
   * Converts decimal text to the (non sortable) base100 form.
   *
   * <p>The input is assumed to be a legal integer (<code>[+-]?digit*</code>);
   * no validation is done.  Leading zeros are normalized away, so 0004, 004,
   * 04 and 4 all produce the same result, and 0 / -0 produce an empty
   * result.</p>
   *
   * <p>The value is written to <code>out</code> from the end towards the
   * start.  Both digits of a pair are read before the pair is written, so
   * <code>arr</code> and <code>out</code> may refer to the same array if
   * desired.</p>
   *
   * @param arr    buffer holding the decimal text
   * @param start  index of the first input char
   * @param end    index one past the last input char
   * @param out    buffer receiving the base100 chars
   * @param outend index one past the last output char
   * @return the index in <code>out</code> where the base100 number starts
   *         (equal to <code>outend</code> for zero)
   */
  public static int base10toBase100(char[] arr, int start, int end,
                                    char[] out, int outend)
  {
    int wpos = outend;  // write position
    boolean neg = false;

    while (--end >= start) {
      int val = arr[end];
      if (val == '+') {
        break;
      }
      if (val == '-') {
        neg = !neg;
        break;
      }

      val -= '0';  // ones digit of this pair
      if (end > start) {
        int val2 = arr[end - 1];
        if (val2 == '+' || val2 == '-') {
          // a lone digit directly after the sign: emit it and stop
          out[--wpos] = (char)val;
          if (val2 == '-') {
            neg = !neg;
          }
          break;
        }
        end--;
        val += (val2 - '0') * 10;  // tens digit of this pair
      }
      out[--wpos] = (char)val;
    }

    // remove leading base100 zeros
    while (wpos < outend && out[wpos] == 0) {
      wpos++;
    }

    // zero (nothing left) never gets a negative sign
    if (neg && wpos != outend) {
      out[--wpos] = NEG_CHAR;
    }

    return wpos;  // the start of the base100 int
  }

  /**
   * Converts a (non sortable) base100 number to decimal text.
   *
   * @param arr    buffer holding the base100 chars
   * @param start  index of the first input char
   * @param end    index one past the last input char
   * @param out    buffer receiving the decimal text
   * @param offset index in <code>out</code> of the first char to write
   * @return the number of chars written; at least 1 char is always written
   */
  public static int base100toBase10(char[] arr, int start, int end,
                                    char[] out, int offset)
  {
    int wpos = offset;  // write position
    boolean firstDigit = true;
    for (int i = start; i < end; i++) {
      int val = arr[i];
      if (val == NEG_CHAR) {
        out[wpos++] = '-';
        continue;
      }
      char tens = (char)(val / 10 + '0');
      if (!firstDigit || tens != '0') {  // skip leading 0
        out[wpos++] = tens;
      }
      out[wpos++] = (char)(val % 10 + '0');
      firstDigit = false;
    }
    if (firstDigit) {
      out[wpos++] = '0';  // no digit chars at all: the value is zero
    }
    return wpos - offset;
  }

  /**
   * Encodes decimal text in the base100 sortable form.
   *
   * @param val decimal text, <code>[+-]?digit*</code> (not validated)
   * @return the encoded value; always at least one char long
   */
  public static String base10toBase100SortableInt(String val) {
    // one extra slot for the sign/exponent char so the conversion can run in place
    char[] arr = new char[val.length() + 1];
    val.getChars(0, val.length(), arr, 0);
    int len = base10toBase100SortableInt(arr, 0, val.length(), arr, arr.length);
    return new String(arr, arr.length - len, len);
  }

  /**
   * Decodes a base100 sortable value to decimal text.
   *
   * @param val the encoded value; an empty string is decoded as "0"
   * @return the decimal text
   */
  public static String base100SortableIntToBase10(String val) {
    int slen = val.length();
    // input occupies [0,slen); output is written from slen on.  Keep at least
    // one slot so that an empty input can still yield "0".
    char[] arr = new char[Math.max(slen << 2, 1)];
    val.getChars(0, slen, arr, 0);
    int len = base100SortableIntToBase10(arr, 0, slen, arr, slen);
    return new String(arr, slen, len);
  }

  /**
   * Encodes decimal text in the base10k sortable form.
   *
   * @param val decimal text, <code>[+-]?digit*</code> (not validated)
   * @return the encoded value; always at least one char long
   */
  public static String base10toBase10kSortableInt(String val) {
    // one extra slot for the sign/exponent char so the conversion can run in place
    char[] arr = new char[val.length() + 1];
    val.getChars(0, val.length(), arr, 0);
    int len = base10toBase10kSortableInt(arr, 0, val.length(), arr, arr.length);
    return new String(arr, arr.length - len, len);
  }

  /**
   * Decodes a base10k sortable value to decimal text.
   *
   * @param val the encoded value; an empty string is decoded as "0"
   * @return the decimal text
   */
  public static String base10kSortableIntToBase10(String val) {
    int slen = val.length();
    // +1 time for orig, +4 for new; at least one slot so "" can yield "0"
    char[] arr = new char[Math.max(slen * 5, 1)];
    val.getChars(0, slen, arr, 0);
    int len = base10kSortableIntToBase10(arr, 0, slen, arr, slen);
    return new String(arr, slen, len);
  }

  /**
   * Returns the index of the first char in <code>[start, last]</code> that is
   * a digit '1'..'9', or <code>last + 1</code> if there is none.
   */
  private static int indexOfFirstNonZeroDigit(char[] arr, int start, int last) {
    while (start <= last && !(arr[start] >= '1' && arr[start] <= '9')) {
      start++;
    }
    return start;
  }

  /** Returns true if <code>[from, to)</code> contains an odd number of '-' chars. */
  private static boolean hasOddMinusCount(char[] arr, int from, int to) {
    boolean odd = false;
    for (int i = from; i < to; i++) {
      if (arr[i] == '-') {
        odd = !odd;
      }
    }
    return odd;
  }

  /**
   * Returns the index of the last char in <code>[first, last]</code> that is
   * not a space, tab, newline or carriage return, or <code>first - 1</code>
   * if there is none.
   */
  private static int trimTrailingWhitespace(char[] arr, int first, int last) {
    while (first <= last) {
      char c = arr[last];
      if (c != ' ' && c != '\t' && c != '\n' && c != '\r') {
        break;
      }
      last--;
    }
    return last;
  }

  /**
   * Writes <code>val</code> as exactly <code>numDigits</code> decimal chars
   * starting at <code>out[wpos]</code>.  The leading char receives whatever
   * quotient is left after the lower digits were split off.
   *
   * @return the write position following the last char written
   */
  private static int writeDecimal(int val, int numDigits, char[] out, int wpos) {
    for (int i = wpos + numDigits - 1; i > wpos; i--) {
      int div = div10(val);
      out[i] = (char)(val - mul10(div) + '0');  // val mod 10
      val = div;
    }
    out[wpos] = (char)(val + '0');
    return wpos + numDigits;
  }

  /**
   * Encodes decimal text in the base100 sortable form.
   *
   * <p>The input is assumed to be a legal integer (<code>[+-]?digit*</code>);
   * no validation is done.  Everything before the first digit '1'..'9' is
   * skipped (each '-' seen there toggles the sign), and spaces, tabs and line
   * breaks at the end are ignored.  Leading zeros are therefore normalized
   * away, and 0 / -0 both produce the single char <code>ZERO_EXPONENT</code>.</p>
   *
   * <p>The value is written to <code>out</code> from the end towards the
   * start.  <code>arr</code> and <code>out</code> may be the same array as
   * long as <code>out</code> leaves room for the additional sign/exponent
   * char (the String overload allocates one extra slot for this).</p>
   *
   * @param arr    buffer holding the decimal text
   * @param start  index of the first input char
   * @param end    index one past the last input char
   * @param out    buffer receiving the encoded chars
   * @param outend index one past the last output char
   * @return the length of the encoded value, which occupies
   *         <code>out[outend - length .. outend)</code>
   */
  public static int base10toBase100SortableInt(char[] arr, int start, int end,
                                               char[] out, int outend)
  {
    int last = end - 1;  // index *of* the last input char
    int first = indexOfFirstNonZeroDigit(arr, start, last);
    boolean neg = hasOddMinusCount(arr, start, first);
    last = trimTrailingWhitespace(arr, first, last);

    // start at the end and work forward, packing two base 10 digits
    // into one base 100 digit
    int wpos = outend;  // write position
    while (first <= last) {
      int val = arr[last--] - '0';        // ones
      if (first <= last) {
        val += (arr[last--] - '0') * 10;  // tens
      }
      out[--wpos] = neg ? (char)(99 - val) : (char)val;
    }

    // the exponent is the number of base 100 chars written; it is stored
    // combined with the sign
    int exponent = outend - wpos;
    out[--wpos] = neg ? (char)(ZERO_EXPONENT - exponent) : (char)(ZERO_EXPONENT + exponent);

    return outend - wpos;  // the length of the base100 int
  }

  /**
   * Decodes a base100 sortable number to decimal text.
   *
   * @param arr    buffer holding the encoded chars
   * @param start  index of the first input char (the sign/exponent char)
   * @param end    index one past the last input char
   * @param out    buffer receiving the decimal text
   * @param offset index in <code>out</code> of the first char to write
   * @return the number of chars written; at least 1 char is always written
   */
  public static int base100SortableIntToBase10(char[] arr, int start, int end,
                                               char[] out, int offset)
  {
    // Zero is the only number represented by the sign/exponent char alone.
    // An empty range carries no digits either and is decoded as zero instead
    // of reading chars that lie outside [start,end).
    if (end - start <= 1) {
      out[offset] = '0';
      return 1;
    }

    int wpos = offset;  // write position
    // the exponent part of the first char is not needed for decoding
    boolean neg = arr[start++] < ZERO_EXPONENT;
    if (neg) {
      out[wpos++] = '-';
    }

    boolean firstDigit = true;
    while (start < end) {
      int val = arr[start++];
      if (neg) {
        val = 99 - val;  // undo the complement applied when encoding
      }
      char tens = (char)(val / 10 + '0');
      if (!firstDigit || tens != '0') {  // skip leading 0
        out[wpos++] = tens;
      }
      out[wpos++] = (char)(val % 10 + '0');
      firstDigit = false;
    }

    return wpos - offset;
  }

  /**
   * Encodes decimal text in the base10k sortable form.
   *
   * <p>The input is assumed to be a legal integer (<code>[+-]?digit*</code>);
   * no validation is done.  Everything before the first digit '1'..'9' is
   * skipped (each '-' seen there toggles the sign), and spaces, tabs and line
   * breaks at the end are ignored.  Leading zeros are therefore normalized
   * away, and 0 / -0 both produce the single char <code>ZERO_EXPONENT</code>.</p>
   *
   * <p>The value is written to <code>out</code> from the end towards the
   * start.  <code>arr</code> and <code>out</code> may be the same array as
   * long as <code>out</code> leaves room for the additional sign/exponent
   * char (the String overload allocates one extra slot for this).</p>
   *
   * @param arr    buffer holding the decimal text
   * @param start  index of the first input char
   * @param end    index one past the last input char
   * @param out    buffer receiving the encoded chars
   * @param outend index one past the last output char
   * @return the length of the encoded value, which occupies
   *         <code>out[outend - length .. outend)</code>
   */
  public static int base10toBase10kSortableInt(char[] arr, int start, int end,
                                               char[] out, int outend)
  {
    int last = end - 1;  // index *of* the last input char
    int first = indexOfFirstNonZeroDigit(arr, start, last);
    boolean neg = hasOddMinusCount(arr, start, first);
    last = trimTrailingWhitespace(arr, first, last);

    // start at the end and work forward, packing up to four base 10 digits
    // into one base 10000 digit
    int wpos = outend;  // write position
    while (first <= last) {
      int val = 0;
      int scale = 1;  // 1, 10, 100, 1000
      for (int i = 0; i < 4 && first <= last; i++) {
        val += (arr[last--] - '0') * scale;
        scale *= 10;
      }
      out[--wpos] = neg ? (char)(9999 - val) : (char)val;
    }

    // the exponent is the number of base 10000 chars written; it is stored
    // combined with the sign
    int exponent = outend - wpos;
    out[--wpos] = neg ? (char)(ZERO_EXPONENT - exponent) : (char)(ZERO_EXPONENT + exponent);

    return outend - wpos;  // the length of the base10k int
  }

  /**
   * Decodes a base10k sortable number to decimal text.
   *
   * @param arr    buffer holding the encoded chars
   * @param start  index of the first input char (the sign/exponent char)
   * @param end    index one past the last input char
   * @param out    buffer receiving the decimal text
   * @param offset index in <code>out</code> of the first char to write
   * @return the number of chars written; at least 1 char is always written
   */
  public static int base10kSortableIntToBase10(char[] arr, int start, int end,
                                               char[] out, int offset)
  {
    // Zero is the only number represented by the sign/exponent char alone
    // (trailing zeros are not chopped).  An empty range carries no digits
    // either and is decoded as zero instead of reading chars that lie
    // outside [start,end).
    if (end - start <= 1) {
      out[offset] = '0';
      return 1;
    }

    int wpos = offset;  // write position
    // the exponent part of the first char is not needed for decoding
    boolean neg = arr[start++] < ZERO_EXPONENT;
    if (neg) {
      out[wpos++] = '-';
    }

    // The first digit char is special: it is printed without leading zeros.
    // Checking its magnitude first also avoids divisions that are not needed.
    int val = arr[start++];
    if (neg) {
      val = 9999 - val;  // undo the complement applied when encoding
    }
    int numDigits;
    if (val < 10) {
      numDigits = 1;
    } else if (val < 100) {
      numDigits = 2;
    } else if (val < 1000) {
      numDigits = 3;
    } else {
      numDigits = 4;
    }
    wpos = writeDecimal(val, numDigits, out, wpos);

    // every following digit char always stands for exactly four digits
    while (start < end) {
      val = arr[start++];
      if (neg) {
        val = 9999 - val;
      }
      wpos = writeDecimal(val, 4, out, wpos);
    }

    return wpos - offset;
  }

}