package org.shanbo.feluca.data2.util; import java.util.Arrays; /** * Split text line by non-digit chars; * digit related chars include : '0'~'9', '.', '+-' * <p><b>WARNING: not support for custom split;</b> * <p><b>WARNING: not thread safe!</b> * @author lgn * */ public class NumericTokenizer { final static int[] POWERS_OF_10 = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; public class FeatureWeight{ long kv; public FeatureWeight(long kv) { this.kv = kv; } public void setKV(long kv){ this.kv = kv; } public int getId(){ return (int) ((kv & 0xffffffff00000000l) >>> 32); } public float getWeight(){ int w = (int)(kv & 0xffffffff); return Float.intBitsToFloat(w); } } public class FeatureFieldWeight{ int fid; int field; float weight; public int getFid(){ return fid; } public int getField() { return field; } public float getWeight(){ return weight; } void set(int fid, int field, float weight){ this.fid = fid; this.field = field; this.weight = weight; } } class ByteArray { final static int MAX_ENLARGE_SIZE = 1024*1024*8; public byte[] array; public int startIdx; public int endIdx; public ByteArray(byte[] array){ this(array, 0, array.length); } public ByteArray(byte[] array, int start, int end){ this.array = array; this.setSegment(start, end); } public void setSegment(int start, int end){ this.startIdx = start; this.endIdx = end; } public void add(byte elem){ } public ByteArray(int size ){ this.array = new byte[size]; this.setSegment(0, size); } public int capacity(){ return array.length - startIdx; } public int size(){ return endIdx - startIdx ; } public byte quickGet(int idx){ return array[idx + startIdx]; } public byte get(int idx){ if (idx + startIdx >= endIdx) throw new IndexOutOfBoundsException(); return array[idx + startIdx]; } public String toString(){ return new String(array, startIdx, endIdx - startIdx); } } byte[] delimeters; byte[] asciiTable = new byte[128]; ByteArray line; FeatureWeight keyWeight; FeatureFieldWeight ffWeight; int digitIdx = 0; // a token's cursor int bound = 0; private void init(byte[] delims){ Arrays.fill(asciiTable, (byte)-3); delimeters = delims; asciiTable[48] = 0; asciiTable[49] = 1; asciiTable[50] = 2; asciiTable[51] = 3; asciiTable[52] = 4; asciiTable[53] = 5; asciiTable[54] = 6; asciiTable[55] = 7; asciiTable[56] = 8; asciiTable[57] = 9; asciiTable[46] = -1; // decimal point asciiTable[58] = -3; // ':' asciiTable[13] = -2; // endl asciiTable[10] = -2; // endl asciiTable[43] = 100;// '+' asciiTable[45] = 100;// '-' for(int i = 0; i < delimeters.length; i++){ asciiTable[delimeters[i]] = -3; //split } } /** * split by [tab] or [space] */ public NumericTokenizer(){ init( new byte[]{(byte)32, (byte)9}); } // @Deprecated // public NumericTokenizer(byte... delims){ // init(delims); // } // @Deprecated // public NumericTokenizer(char... delims){ // byte[] delim = new byte[delims.length]; // for(int i = 0 ; i < delim.length ; i ++) // delim[i] =( byte)delims[i]; // init(delim); // } /** * for parse a new line; * @param ba */ public void load(ByteArray ba){ this.line = ba; this.digitIdx = 0; this.bound = line.size(); } public void load(String line){ load( new ByteArray(line.getBytes())); } public boolean hasNext(){ if (digitIdx >= bound){ return false; } return true; } /** * return a number * @return Integer or Float */ public Number nextNumber(){ long value = 0; boolean isFloat = false; int dpIndex = 0; boolean isNegative = false; byte k = line.quickGet(digitIdx); for(; digitIdx < bound && asciiTable[line.quickGet(digitIdx)] > -2 ; digitIdx += 1){ k = line.quickGet(digitIdx); if (k == 43){ // '+' // digitIdx += 1; continue; }else if (k == 45){ // '-' isNegative = true; // digitIdx += 1; continue; } if (asciiTable[k] == -1){ isFloat = true; dpIndex = digitIdx; }else value = value * 10 + asciiTable[k]; // digitIdx += 1; } int endIdx = digitIdx; // for(; digitIdx < bound && asciiTable[line.quickGet(digitIdx)] <= -2 ; digitIdx += 1){ ; } if (!isNegative) if (isFloat){ if (endIdx - dpIndex - 1 < 10) return new Float((value + 0.0) / POWERS_OF_10[endIdx - dpIndex - 1]); else return new Float(value / Math.pow(10, endIdx - dpIndex - 1)); }else{ return new Integer((int)value); } else if (isFloat){ if (endIdx - dpIndex - 1 < 10) return new Float(-(value + 0.0) / POWERS_OF_10[endIdx - dpIndex - 1]); else return new Float(-value / Math.pow(10, endIdx - dpIndex - 1)); }else{ return new Integer(-(int)value); } } /** * 64 bits for key:value pair * first 32 bits(integer) for id, last 32 bits(float) for weight * @return 64 bits(long) for key:value pair */ private long nextKeyValuePair(){ long id = (Integer)nextNumber(); Object w = nextNumber(); Float weight = (w instanceof Float )? (Float)w : (Integer)w; long result = (((long)id) << 32) | (Float.floatToIntBits(weight) & 0xffffffffl); return result; } public FeatureWeight nextKeyWeight(){ long got = nextKeyValuePair(); if (keyWeight == null){ keyWeight = new FeatureWeight(got); }else{ keyWeight.setKV(got); } return keyWeight; } public FeatureFieldWeight nextFFW(){ int fid = (Integer)nextNumber(); int field = (Integer)nextNumber(); Object w = nextNumber(); Float weight = (w instanceof Float )? (Float)w : (Integer)w; if (ffWeight == null){ ffWeight = new FeatureFieldWeight(); } ffWeight.set(fid, field, weight); return ffWeight; } // public static int extractFeatureId(long kv){ // return (int) ((kv & 0xffffffff00000000l) >>> 32); // } // // public static float extractWeight(long kv){ // int w = (int)(kv & 0xffffffff); // return Float.intBitsToFloat(w); // } }