/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util;

/**
 * Static helpers that pack decimal integer text ({@code [+-]?digit*}) into chars holding
 * several decimal digits each, and unpack it again.
 *
 * <p>Three encodings are provided:
 * <ul>
 *   <li><b>base 100</b> ({@link #base10toBase100} / {@link #base100toBase10}): two decimal
 *       digits per char, an optional leading sign marker, zero encoded as an empty range;</li>
 *   <li><b>sortable base 100</b>: two decimal digits per char, preceded by one char that
 *       combines the sign with the number of digit chars;</li>
 *   <li><b>sortable base 10k</b>: same layout, four decimal digits per char.</li>
 * </ul>
 *
 * <p>In the sortable encodings the digits of negative numbers are stored complemented
 * (99 - d, resp. 9999 - d) and the leading char moves away from {@code 'a'} in the direction
 * of the sign, so that comparing two encoded values char by char orders them like the
 * integers they represent.
 *
 * <p>WARNING: the input is assumed to be a legal integer; its characters are not validated.
 *
 * @version $Id: BCDUtils.java 555343 2007-07-11 17:46:25Z hossman $
 */
public class BCDUtils {

  /**
   * Divides by ten without an integer divide (idiv is expensive) by multiplying with a fixed
   * point approximation of 1/10; see http://www.cs.uiowa.edu/~jones/bcd/divide.html
   * Only used here on base-10k digit values (0..9999).
   */
  private static int div10(int a) {
    return (a * 0xcccd) >>> 19;
  }

  /**
   * Multiplies by ten. Shift/add forms such as {@code (a<<3)+(a<<1)} or {@code (a+(a<<2))<<1}
   * (an attempt to use the LEA instruction) were considered, but imul32 on AMD64 only has a
   * 3 cycle latency in any case.
   */
  private static int mul10(int a) {
    return a * 10;
  }

  /**
   * Sign marker of the non-sortable base-100 form. It must not clash with a base-100 digit
   * char, so it has to be something &gt;= 100.
   */
  private static final char NEG_CHAR = (char) 126;

  /**
   * The zero exponent (97). For these integer representations the sign and the exponent
   * (number of digit chars) are combined into the first char: the value is negative if that
   * char is below this zero point, so there are no negative exponents as such.
   *
   * <p>Design note kept from the original source: a possible future format, meant to be valid
   * for floating point numbers as well, would use a separate negative-sign char and dedicated
   * multi-char escape values for exponents that do not fit into a single char.
   */
  private static final int ZERO_EXPONENT = 'a';

  /**
   * Converts decimal text to the (non-sortable) base-100 form, two decimal digits per char.
   *
   * <p>Zeros are normalized: 0004, 004, 04 and 4 all produce the same output, and 0 and -0
   * both produce an empty range. A negative value is prefixed with a sign marker char.
   *
   * <p>The value is written to {@code out} from the end towards the start. {@code arr} and
   * {@code out} may be the same array.
   *
   * @param arr    source chars
   * @param start  index of the first source char (inclusive)
   * @param end    index after the last source char (exclusive)
   * @param out    destination buffer
   * @param outend index in {@code out} just past where the last base-100 char is written
   * @return the index in {@code out} where the base-100 value starts; it extends to
   *         {@code outend}
   */
  public static int base10toBase100(char[] arr, int start, int end, char[] out, int outend) {
    int wpos = outend; // write position
    boolean neg = false;

    // Walk the input right to left, consuming two decimal digits per output char.
    while (--end >= start) {
      int val = arr[end];
      if (val == '+') {
        break;
      } else if (val == '-') {
        neg = !neg;
        break;
      } else {
        val = val - '0';
        if (end > start) {
          int val2 = arr[end - 1];
          // A sign directly left of a lone digit ends the number.
          if (val2 == '+') {
            out[--wpos] = (char) val;
            break;
          }
          if (val2 == '-') {
            out[--wpos] = (char) val;
            neg = !neg;
            break;
          }
          end--;
          val = val + (val2 - '0') * 10;
        }
        out[--wpos] = (char) val;
      }
    }

    // remove leading base100 zeros
    while (wpos < outend && out[wpos] == 0) {
      wpos++;
    }

    // A zero value (nothing left) never gets a negative sign.
    if (wpos != outend && neg) {
      out[--wpos] = NEG_CHAR;
    }

    return wpos; // the start of the base100 int
  }

  /**
   * Converts a (non-sortable) base-100 number to decimal character form.
   * At least one char is always written: an empty input range yields "0".
   *
   * @param arr    source chars in base-100 form
   * @param start  index of the first source char (inclusive)
   * @param end    index after the last source char (exclusive)
   * @param out    destination buffer
   * @param offset index in {@code out} of the first char to write
   * @return the number of chars written
   */
  public static int base100toBase10(char[] arr, int start, int end, char[] out, int offset) {
    int wpos = offset; // write position
    boolean firstDigit = true;
    for (int i = start; i < end; i++) {
      int val = arr[i];
      if (val == NEG_CHAR) {
        out[wpos++] = '-';
        continue;
      }
      char tens = (char) (val / 10 + '0');
      if (!firstDigit || tens != '0') { // skip leading 0
        out[wpos++] = tens; // tens position
      }
      out[wpos++] = (char) (val % 10 + '0'); // ones position
      firstDigit = false;
    }
    if (firstDigit) {
      out[wpos++] = '0';
    }
    return wpos - offset;
  }

  /**
   * Encodes decimal text as a sortable base-100 string.
   *
   * @param val decimal integer text
   * @return the encoded value
   * @throws NumberFormatException if the number has too many digits for its sign/exponent
   *         char (see {@link #base10toBase100SortableInt(char[], int, int, char[], int)})
   */
  public static String base10toBase100SortableInt(String val) {
    // One extra char: a single digit encodes to two chars (exponent + digit).
    char[] arr = new char[val.length() + 1];
    val.getChars(0, val.length(), arr, 0);
    int len = base10toBase100SortableInt(arr, 0, val.length(), arr, arr.length);
    return new String(arr, arr.length - len, len);
  }

  /**
   * Decodes a sortable base-100 string back to decimal text.
   *
   * @param val a value produced by {@link #base10toBase100SortableInt(String)}; must not be
   *            empty
   * @return the decimal text
   */
  public static String base100SortableIntToBase10(String val) {
    int slen = val.length();
    char[] arr = new char[slen << 2]; // input in the first quarter, output after it
    val.getChars(0, slen, arr, 0);
    int len = base100SortableIntToBase10(arr, 0, slen, arr, slen);
    return new String(arr, slen, len);
  }

  /**
   * Encodes decimal text as a sortable base-10k string.
   *
   * @param val decimal integer text
   * @return the encoded value
   * @throws NumberFormatException if the number has too many digits for its sign/exponent
   *         char (see {@link #base10toBase10kSortableInt(char[], int, int, char[], int)})
   */
  public static String base10toBase10kSortableInt(String val) {
    // One extra char: a single digit encodes to two chars (exponent + digit).
    char[] arr = new char[val.length() + 1];
    val.getChars(0, val.length(), arr, 0);
    int len = base10toBase10kSortableInt(arr, 0, val.length(), arr, arr.length);
    return new String(arr, arr.length - len, len);
  }

  /**
   * Decodes a sortable base-10k string back to decimal text.
   *
   * @param val a value produced by {@link #base10toBase10kSortableInt(String)}; must not be
   *            empty
   * @return the decimal text
   */
  public static String base10kSortableIntToBase10(String val) {
    int slen = val.length();
    char[] arr = new char[slen * 5]; // +1 time for orig, +4 for new
    val.getChars(0, slen, arr, 0);
    int len = base10kSortableIntToBase10(arr, 0, slen, arr, slen);
    return new String(arr, slen, len);
  }

  /**
   * Builds the leading char that combines sign and exponent for the sortable encodings.
   *
   * @param neg    whether the number is negative
   * @param groups number of digit chars that follow the exponent char
   * @return {@code ZERO_EXPONENT - groups} for negative numbers, {@code ZERO_EXPONENT + groups}
   *         otherwise
   * @throws NumberFormatException if that value does not fit in a char; casting it anyway
   *         would wrap around and silently break both ordering and decoding
   */
  private static char exponentChar(boolean neg, int groups) {
    int exp = neg ? ZERO_EXPONENT - groups : ZERO_EXPONENT + groups;
    if (exp < Character.MIN_VALUE || exp > Character.MAX_VALUE) {
      throw new NumberFormatException(
          "Too many digits for a sortable int: " + groups + " digit groups");
    }
    return (char) exp;
  }

  /**
   * Converts decimal text to the sortable base-100 form.
   *
   * <p>Leading signs and zeros are consumed first ('-' toggles the sign, everything up to the
   * first digit 1-9 is skipped), then whitespace on the right hand side is dropped. Zeros are
   * therefore normalized: 0004, 004, 04 and 4 all encode the same, and 0 and -0 both encode
   * as the single char {@code 'a'}.
   *
   * <p>The value is written to {@code out} from the end towards the start. The output needs
   * one char per two decimal digits plus one char for the sign/exponent, so it can be one
   * char longer than a one-digit input; the String overload uses a shared array that is one
   * char longer than the input for that reason.
   *
   * <p>Trailing pairs of zeros are deliberately not stripped: that normalization was removed
   * since it only helps 1 in 100 numbers and complicates both encoding and decoding.
   *
   * @param arr    source chars
   * @param start  index of the first source char (inclusive)
   * @param end    index after the last source char (exclusive)
   * @param out    destination buffer
   * @param outend index in {@code out} just past where the last encoded char is written
   * @return the length of the encoded value, which ends at {@code outend}
   * @throws NumberFormatException if the sign/exponent does not fit in a char (a negative
   *         number with more than 97 digit chars, i.e. 195 or more significant decimal
   *         digits); thrown before {@code out} is modified
   */
  public static int base10toBase100SortableInt(char[] arr, int start, int end, char[] out, int outend) {
    int wpos = outend; // write position
    boolean neg = false;
    --end; // position end pointer *on* the last char

    // read signs and leading zeros
    while (start <= end) {
      char val = arr[start];
      if (val == '-') {
        neg = !neg;
      } else if (val >= '1' && val <= '9') {
        break;
      }
      start++;
    }

    // eat whitespace on RHS
    outer:
    while (start <= end) {
      switch (arr[end]) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
          end--;
          break;
        default:
          break outer;
      }
    }

    // Two decimal digits per char, rounded up; validated before anything is written.
    int groups = (end - start + 2) / 2;
    char expChar = exponentChar(neg, groups);

    // now start at the end and work our way forward
    // encoding two base 10 digits into 1 base 100 digit
    while (start <= end) {
      int val = arr[end--] - '0'; // ones
      if (start <= end) {
        val += (arr[end--] - '0') * 10; // tens
      }
      // Negative numbers store the complement so that larger magnitudes sort first.
      out[--wpos] = neg ? (char) (99 - val) : (char) val;
    }

    // write the exponent and sign combined
    out[--wpos] = expChar;

    return outend - wpos; // the length of the base100 int
  }

  /**
   * Converts a sortable base-100 number to decimal character form.
   *
   * @param arr    source chars in sortable base-100 form; the range must not be empty
   * @param start  index of the first source char (inclusive)
   * @param end    index after the last source char (exclusive)
   * @param out    destination buffer
   * @param offset index in {@code out} of the first char to write
   * @return the number of chars written
   */
  public static int base100SortableIntToBase10(char[] arr, int start, int end, char[] out, int offset) {
    // Take care of "0" case first. It's the only number that is represented
    // in one char.
    if (end - start == 1) {
      out[offset] = '0';
      return 1;
    }

    int wpos = offset; // write position
    // Only the side of the zero point matters for decoding; the magnitude is not used.
    boolean neg = arr[start++] < ZERO_EXPONENT;
    if (neg) {
      out[wpos++] = '-';
    }

    boolean firstDigit = true;
    while (start < end) {
      int val = arr[start++];
      if (neg) {
        val = 99 - val;
      }
      // opt - if we ever want a faster version we can avoid one integer
      // divide by using fixed point math to multiply by 1/10
      // http://www.cs.uiowa.edu/~jones/bcd/divide.html
      char tens = (char) (val / 10 + '0');
      if (!firstDigit || tens != '0') { // skip leading 0
        out[wpos++] = tens; // write tens position
      }
      out[wpos++] = (char) (val % 10 + '0'); // write ones position
      firstDigit = false;
    }

    return wpos - offset;
  }

  /**
   * Converts decimal text to the sortable base-10k form.
   *
   * <p>Works like {@link #base10toBase100SortableInt(char[], int, int, char[], int)} but packs
   * four decimal digits into each char: leading signs and zeros are consumed, whitespace on
   * the right hand side is dropped, and 0 and -0 both encode as the single char {@code 'a'}.
   * Trailing zeros are not stripped.
   *
   * <p>The value is written to {@code out} from the end towards the start and needs one char
   * per four decimal digits plus one char for the sign/exponent.
   *
   * @param arr    source chars
   * @param start  index of the first source char (inclusive)
   * @param end    index after the last source char (exclusive)
   * @param out    destination buffer
   * @param outend index in {@code out} just past where the last encoded char is written
   * @return the length of the encoded value, which ends at {@code outend}
   * @throws NumberFormatException if the sign/exponent does not fit in a char (a negative
   *         number with more than 97 digit chars, i.e. 389 or more significant decimal
   *         digits); thrown before {@code out} is modified
   */
  public static int base10toBase10kSortableInt(char[] arr, int start, int end, char[] out, int outend) {
    int wpos = outend; // write position
    boolean neg = false;
    --end; // position end pointer *on* the last char

    // read signs and leading zeros
    while (start <= end) {
      char val = arr[start];
      if (val == '-') {
        neg = !neg;
      } else if (val >= '1' && val <= '9') {
        break;
      }
      start++;
    }

    // eat whitespace on RHS
    outer:
    while (start <= end) {
      switch (arr[end]) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
          end--;
          break;
        default:
          break outer;
      }
    }

    // Four decimal digits per char, rounded up; validated before anything is written.
    int groups = (end - start + 4) / 4;
    char expChar = exponentChar(neg, groups);

    // now start at the end and work our way forward
    // encoding four base 10 digits into 1 base 10k digit
    while (start <= end) {
      int val = arr[end--] - '0'; // ones
      if (start <= end) {
        val += (arr[end--] - '0') * 10; // tens
        if (start <= end) {
          val += (arr[end--] - '0') * 100; // hundreds
          if (start <= end) {
            val += (arr[end--] - '0') * 1000; // thousands
          }
        }
      }
      // Negative numbers store the complement so that larger magnitudes sort first.
      out[--wpos] = neg ? (char) (9999 - val) : (char) val;
    }

    // write the exponent and sign combined
    out[--wpos] = expChar;

    return outend - wpos; // the length of the base10k int
  }

  /**
   * Writes {@code val} as exactly four decimal digits (zero padded) starting at
   * {@code out[wpos]} and returns the write position after them.
   */
  private static int writeFourDigits(int val, char[] out, int wpos) {
    int rest = div10(val);
    int ones = val - mul10(rest); // mod 10
    val = rest;
    rest = div10(val);
    int tens = val - mul10(rest); // mod 10
    val = rest;
    rest = div10(val);
    int hundreds = val - mul10(rest); // mod 10

    out[wpos++] = (char) (rest + '0'); // thousands
    out[wpos++] = (char) (hundreds + '0');
    out[wpos++] = (char) (tens + '0');
    out[wpos++] = (char) (ones + '0');
    return wpos;
  }

  /**
   * Converts a sortable base-10k number to decimal character form.
   *
   * @param arr    source chars in sortable base-10k form; the range must not be empty
   * @param start  index of the first source char (inclusive)
   * @param end    index after the last source char (exclusive)
   * @param out    destination buffer
   * @param offset index in {@code out} of the first char to write
   * @return the number of chars written
   */
  public static int base10kSortableIntToBase10(char[] arr, int start, int end, char[] out, int offset) {
    // Take care of "0" case first. It's the only number that is represented
    // in one char since we don't chop trailing zeros.
    if (end - start == 1) {
      out[offset] = '0';
      return 1;
    }

    int wpos = offset; // write position
    // We don't currently use the exponent magnitude on decoding, only its sign.
    boolean neg = arr[start++] < ZERO_EXPONENT;
    if (neg) {
      out[wpos++] = '-';
    }

    // Since so many values will fall in one char, the first digit group is pulled out of
    // the loop; it is also the only group that must not print leading zeros. Integer
    // division is still expensive, so only as many divisions as needed are done.
    int val = arr[start++];
    if (neg) {
      val = 9999 - val;
    }
    if (val < 10) {
      out[wpos++] = (char) (val + '0');
    } else if (val < 100) {
      int tens = div10(val);
      int ones = val - mul10(tens); // mod 10
      out[wpos++] = (char) (tens + '0');
      out[wpos++] = (char) (ones + '0');
    } else if (val < 1000) {
      int rest = div10(val);
      int ones = val - mul10(rest); // mod 10
      int hundreds = div10(rest);
      int tens = rest - mul10(hundreds); // mod 10
      out[wpos++] = (char) (hundreds + '0');
      out[wpos++] = (char) (tens + '0');
      out[wpos++] = (char) (ones + '0');
    } else {
      wpos = writeFourDigits(val, out, wpos);
    }

    // Every following group is written with all four digits.
    while (start < end) {
      val = arr[start++];
      if (neg) {
        val = 9999 - val;
      }
      wpos = writeFourDigits(val, out, wpos);
    }

    return wpos - offset;
  }
}