package net.varkhan.base.conversion.character;

import net.varkhan.base.conversion.serializer.DecodingException;
import net.varkhan.base.conversion.serializer.EncodingException;

import java.util.Arrays;


/**
 * <b>An immutable character sequence stored as UTF-8 encoded bytes</b>.
 * <p/>
 * Instances hold a private copy of the encoded bytes; character indexes are
 * resolved by scanning the encoded data from its start.
 * <p/>
 * Encoding works on individual Java {@code char} values: each one becomes a
 * sequence of one to three bytes, so the two halves of a surrogate pair are
 * encoded separately. Four-byte sequences are accepted when decoding, but the
 * methods that produce {@code char} values truncate the decoded value to 16 bits.
 * <p/>
 * Note that {@link #equals(Object)} accepts any {@link CharSequence} with the
 * same characters, while {@link #hashCode()} is computed from the encoded bytes;
 * the hash code is therefore only consistent between {@code UTF8String} instances.
 *
 * @author varkhan
 * @date 1/11/15
 * @time 4:05 PM
 */
public class UTF8String implements CharSequence, java.io.Serializable {

    /** The UTF-8 encoded characters of this string */
    protected final byte[] data;

    /**
     * Creates a string from a range of encoded bytes, which are copied.
     *
     * @param data  the array containing the encoded characters
     * @param start the offset of the first byte (inclusive)
     * @param end   the offset of the last byte (exclusive)
     * @throws DecodingException         if the range contains an invalid leading byte,
     *                                   or ends in the middle of a multi-byte sequence
     * @throws IndexOutOfBoundsException if the range reaches outside of the array
     */
    public UTF8String(byte[] data, int start, int end) {
        // Use idxAt to validate the leading bytes and the array bounds
        idxAt(data, start, end);
        // idxAt skips continuation bytes without reading them, so a sequence
        // cut short by the end of the range has to be detected separately
        requireComplete(data, start, end);
        this.data=new byte[end-start];
        System.arraycopy(data, start, this.data, 0, end-start);
    }

    /**
     * Creates a string from an array of encoded bytes, which is copied.
     *
     * @param data the encoded characters
     * @throws DecodingException if the array is not made of complete sequences
     */
    public UTF8String(byte[] data) {
        this(data, 0, data.length);
    }

    /**
     * Creates a string by encoding a range of a char array.
     *
     * @param chars the array containing the characters
     * @param start the index of the first character (inclusive)
     * @param end   the index of the last character (exclusive)
     */
    public UTF8String(char[] chars, int start, int end) {
        data=new byte[(int) bytesLen(chars, start, end)];
        bytes(chars, start, end, data, 0, data.length);
    }

    /**
     * Creates a string by encoding a range of a character sequence.
     *
     * @param str   the sequence containing the characters
     * @param start the index of the first character (inclusive)
     * @param end   the index of the last character (exclusive)
     */
    public UTF8String(CharSequence str, int start, int end) {
        data=new byte[(int) bytesLen(str, start, end)];
        bytes(str, start, end, data, 0, data.length);
    }

    /**
     * Returns the number of encoded characters, computed by scanning the data.
     *
     * @return the number of characters in this string
     */
    @Override
    public int length() {
        return (int) idxAt(data, 0, data.length);
    }

    /**
     * Returns the character at a given index, found by scanning the data.
     *
     * @param index the index of the character
     * @return the decoded character, truncated to 16 bits
     * @throws IndexOutOfBoundsException if the index is negative or not less than the length
     */
    @Override
    public char charAt(int index) {
        if(index<0) {
            throw new IndexOutOfBoundsException("Character index "+index+" out of bounds");
        }
        long pos=posAt(data, 0, index);
        // posAt stops exactly at the end of the data when index equals the length
        if(pos>=data.length) {
            throw new IndexOutOfBoundsException("Character index "+index+" out of bounds");
        }
        return (char) charAt(data, pos);
    }

    /**
     * Returns a new string holding a copy of a range of characters.
     *
     * @param start the index of the first character (inclusive)
     * @param end   the index of the last character (exclusive)
     * @return a string containing the characters of the range
     * @throws IndexOutOfBoundsException if start is negative, end is less than start,
     *                                   or the range reaches beyond the data
     */
    @Override
    public UTF8String subSequence(int start, int end) {
        if(start<0 || end<start) {
            throw new IndexOutOfBoundsException("Invalid character range "+start+".."+end);
        }
        long bs=posAt(data, 0, start);
        long be=posAt(data, bs, end-start);
        return new UTF8String(data, (int) bs, (int) be);
    }

    /**
     * Compares this string to another object.
     * <p/>
     * Another {@code UTF8String} is equal if it holds the same bytes; any other
     * {@link CharSequence} is equal if {@link #compare(byte[], long, long, CharSequence)}
     * finds no difference with it. That second form is neither symmetric nor
     * reflected in {@link #hashCode()}.
     *
     * @param o the object to compare to
     * @return {@code true} if the object is a character sequence with the same content
     */
    @Override
    public boolean equals(Object o) {
        if(this==o) {
            return true;
        }
        if(!(o instanceof CharSequence)) {
            return false;
        }
        if(o instanceof UTF8String) {
            UTF8String that=(UTF8String) o;
            return Arrays.equals(this.data, that.data);
        }
        CharSequence that=(CharSequence) o;
        return compare(data, 0, data.length, that)==0;
    }

    /**
     * Returns a hash code computed from the encoded bytes.
     *
     * @return the hash code of the encoded data
     */
    @Override
    public int hashCode() {
        int h=0;
        for(byte c : data) {
            h=31*h+c;
        }
        return h;
    }

    /**
     * Decodes this string into a {@link String}.
     *
     * @return a String containing the decoded characters
     */
    @Override
    public String toString() {
        char[] c=new char[(int) idxAt(data, 0, data.length)];
        chars(c, 0, c.length, data, 0, data.length);
        return new String(c);
    }

    /**
     * Returns a copy of the encoded bytes.
     *
     * @return a new array containing the encoded data
     */
    public byte[] getBytes() {
        return data.clone();
    }

    /**
     * Finds the first occurrence of a character.
     *
     * @param chr the character to search for
     * @return the index of the first occurrence, or {@code -1} if there is none
     */
    public int indexOf(int chr) {
        return indexOf(chr, 0);
    }

    /**
     * Finds the first occurrence of a character at or after a given index.
     *
     * @param chr the character to search for
     * @param pos the index to start searching from; negative values are treated as 0
     * @return the index of the first occurrence, or {@code -1} if there is none
     * @throws IndexOutOfBoundsException if pos is greater than the length
     */
    public int indexOf(int chr, int pos) {
        // A negative start would otherwise be added to the relative result below
        int from=pos<0 ? 0 : pos;
        long beg=posAt(data, 0, from);
        long idx=indexOf(data, beg, data.length, chr);
        return idx<0 ? -1 : (int) idx+from;
    }

    /**
     * Finds the first occurrence of a character sequence.
     *
     * @param str the sequence to search for
     * @return the index of the first occurrence, or {@code -1} if there is none
     */
    public int indexOf(CharSequence str) {
        return indexOf(str, 0);
    }

    /**
     * Finds the first occurrence of a character sequence at or after a given index.
     *
     * @param str the sequence to search for
     * @param pos the index to start searching from; negative values are treated as 0
     * @return the index of the first occurrence, or {@code -1} if there is none
     * @throws IndexOutOfBoundsException if pos is greater than the length
     */
    public int indexOf(CharSequence str, int pos) {
        // A negative start would otherwise be added to the relative result below
        int from=pos<0 ? 0 : pos;
        long beg=posAt(data, 0, from);
        long idx=indexOf(data, beg, data.length, str);
        return idx<0 ? -1 : (int) idx+from;
    }


    /**********************************************************************************
     **  Static methods operating on UTF8 byte arrays
     **/

    /**
     * Counts the encoded characters that start inside a byte range.
     * <p/>
     * Only leading bytes are read: continuation bytes are skipped, so a sequence
     * whose leading byte lies inside the range is counted even if it extends past
     * {@code end}.
     *
     * @param dat the encoded data
     * @param beg the offset of the first byte (inclusive)
     * @param end the offset of the last byte (exclusive)
     * @return the number of characters starting between the two offsets
     * @throws DecodingException         if an invalid leading byte is found
     * @throws IndexOutOfBoundsException if a leading byte would be read outside of the array
     */
    public static long idxAt(byte[] dat, long beg, long end) {
        int p=(int) beg;
        int i=0;
        while(p<end) {
            try {
                int b=0xFF&dat[p++];
                // Skip the continuation bytes announced by the leading byte
                p+=sequenceLength(b)-1;
                i++;
            }
            catch(ArrayIndexOutOfBoundsException e) {
                throw new IndexOutOfBoundsException("Character index "+i+" out of data bounds "+p);
            }
        }
        return i;
    }

    /**
     * Finds the byte offset reached after skipping a number of characters.
     *
     * @param dat the encoded data
     * @param beg the byte offset to start from
     * @param idx the number of characters to skip
     * @return the byte offset following the skipped characters
     * @throws DecodingException         if an invalid leading byte is found
     * @throws IndexOutOfBoundsException if a leading byte would be read outside of the array
     */
    public static long posAt(byte[] dat, long beg, long idx) {
        int p=(int) beg;
        long i=0;
        while(i<idx) {
            try {
                int b=0xFF&dat[p++];
                // Skip the continuation bytes announced by the leading byte
                p+=sequenceLength(b)-1;
                i++;
            }
            catch(ArrayIndexOutOfBoundsException e) {
                throw new IndexOutOfBoundsException("Character index "+idx+" out of data bounds "+p);
            }
        }
        return p;
    }

    /**
     * Decodes the character whose sequence starts at a byte offset.
     *
     * @param dat the encoded data
     * @param beg the byte offset of the leading byte
     * @return the decoded value (up to 21 bits for a four-byte sequence)
     * @throws DecodingException if the leading byte is invalid, or the sequence
     *                           extends outside of the array
     */
    public static int charAt(byte[] dat, long beg) {
        int p=(int) beg;
        try {
            int b=0xFF&dat[p++];
            if(b<0x80) {
                return b&0x7F;
            }
            else if(b<0xC0) {
                throw new DecodingException("Incomplete Unicode sequence");
            }
            else if(b<0xE0) {
                return ((b&0x1F)<<6)|(dat[p++]&0x3F);
            }
            else if(b<0xF0) {
                return ((b&0x0F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F);
            }
            else if(b<0xF8) {
                return ((b&0x07)<<18)|((dat[p++]&0x3F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F);
            }
            else {
                throw new DecodingException("Incomplete Unicode sequence");
            }
        }
        catch(ArrayIndexOutOfBoundsException e) {
            throw new DecodingException("Incomplete Unicode sequence");
        }
    }

    /**
     * Finds the first occurrence of a character in a byte range.
     *
     * @param dat the encoded data
     * @param beg the offset of the first byte (inclusive)
     * @param end the offset of the last byte (exclusive)
     * @param chr the character to search for
     * @return the index of the first occurrence, counted in characters from
     *         {@code beg}, or {@code -1} if there is none
     * @throws DecodingException         if an invalid leading byte is found
     * @throws IndexOutOfBoundsException if a byte would be read outside of the array
     */
    public static long indexOf(byte[] dat, long beg, long end, int chr) {
        int p=(int) beg;
        long i=0;
        while(p<end) {
            try {
                int b=0xFF&dat[p++];
                int u;
                if(b<0x80) {
                    u=b&0x7F;
                }
                else if(b<0xC0) {
                    throw new DecodingException("Incomplete Unicode sequence");
                }
                else if(b<0xE0) {
                    u=((b&0x1F)<<6)|(dat[p++]&0x3F);
                }
                else if(b<0xF0) {
                    u=((b&0x0F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F);
                }
                else if(b<0xF8) {
                    u=((b&0x07)<<18)|((dat[p++]&0x3F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F);
                }
                else {
                    throw new DecodingException("Incomplete Unicode sequence");
                }
                if(chr==u) {
                    return i;
                }
                i++;
            }
            catch(ArrayIndexOutOfBoundsException e) {
                throw new IndexOutOfBoundsException("Character index out of data bounds "+p);
            }
        }
        return -1L;
    }

    /**
     * Compares encoded data with a character sequence, character by character.
     * <p/>
     * Note that {@code len} is compared directly with the byte offset, and thus
     * acts as the exclusive end offset of the data.
     *
     * @param dat the encoded data
     * @param pos the offset of the first byte (inclusive)
     * @param len the offset of the last byte (exclusive)
     * @param str the sequence to compare to
     * @return the difference between the first pair of distinct characters (the
     *         one from {@code str} minus the one from the data), or if there is
     *         none, the number of characters left in {@code str} minus the number
     *         of bytes left in the data; {@code 0} means both have the same content
     * @throws DecodingException if an invalid leading byte is found, or a sequence
     *                           extends outside of the array
     */
    public static long compare(byte[] dat, long pos, long len, CharSequence str) {
        int p=(int) pos;
        int q=0;
        int clen=str.length();
        while(p<len && q<clen) {
            try {
                int b=0xFF&dat[p++];
                int d=0;
                int c=str.charAt(q++);
                if(b<0x80) {
                    d=c-(b&0x7F);
                }
                else if(b<0xC0) {
                    throw new DecodingException("Incomplete Unicode sequence");
                }
                else if(b<0xE0) {
                    d=c-(((b&0x1F)<<6)|(dat[p++]&0x3F));
                }
                else if(b<0xF0) {
                    d=c-(((b&0x0F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F));
                }
                else if(b<0xF8) {
                    d=c-(((b&0x07)<<18)|((dat[p++]&0x3F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F));
                }
                else {
                    throw new DecodingException("Incomplete Unicode sequence");
                }
                if(d!=0) {
                    return d;
                }
            }
            catch(ArrayIndexOutOfBoundsException e) {
                throw new DecodingException("Incomplete Unicode sequence");
            }
        }
        return (clen-q)-(len-p);
    }

    /**
     * Finds the first occurrence of a character sequence in a byte range.
     *
     * @param dat the encoded data
     * @param beg the offset of the first byte (inclusive)
     * @param end the offset of the last byte (exclusive)
     * @param str the sequence to search for
     * @return the index of the first occurrence, counted in characters from
     *         {@code beg}, or {@code -1} if there is none; an empty sequence
     *         is found at index {@code 0}
     * @throws DecodingException if an invalid leading byte is found, or a sequence
     *                           extends outside of the array
     */
    public static long indexOf(byte[] dat, long beg, long end, CharSequence str) {
        int clen=str.length();
        // The empty sequence matches right at the start, as for String.indexOf
        if(clen==0) {
            return 0L;
        }
        int p=(int) beg;
        long i=0;
        while(p<end) {
            // Try to match the whole sequence against the data starting at p
            int q=0;
            int pq=p;
            boolean found=true;
            while(q<clen) {
                // Too few chars remaining? not found, here or at any later position
                if(pq>=end) {
                    return -1L;
                }
                int c=str.charAt(q++);
                int u=charAt(dat, pq);
                pq+=sequenceLength(0xFF&dat[pq]);
                if(c!=u) {
                    found=false;
                    break;
                }
            }
            if(found) {
                return i;
            }
            // Move the candidate position forward by one character
            p+=sequenceLength(0xFF&dat[p]);
            i++;
        }
        return -1L;
    }

    /**
     * Returns the number of bytes of the sequence introduced by a leading byte.
     *
     * @param b the leading byte, as an unsigned value
     * @return the length of the sequence, between 1 and 4
     * @throws DecodingException if the byte cannot start a sequence
     */
    private static int sequenceLength(int b) {
        if(b<0x80) {
            return 1;
        }
        if(b<0xC0) {
            throw new DecodingException("Incomplete Unicode sequence");
        }
        if(b<0xE0) {
            return 2;
        }
        if(b<0xF0) {
            return 3;
        }
        if(b<0xF8) {
            return 4;
        }
        throw new DecodingException("Incomplete Unicode sequence");
    }

    /**
     * Checks that no sequence starting inside a byte range extends past its end.
     * <p/>
     * The range must already be known to lie inside the array and to contain
     * valid leading bytes.
     *
     * @param dat the encoded data
     * @param beg the offset of the first byte (inclusive)
     * @param end the offset of the last byte (exclusive)
     * @throws DecodingException if the last sequence is cut short by {@code end}
     */
    private static void requireComplete(byte[] dat, int beg, int end) {
        int p=beg;
        while(p<end) {
            p+=sequenceLength(0xFF&dat[p]);
            if(p>end) {
                throw new DecodingException("Incomplete Unicode sequence");
            }
        }
    }


    /**********************************************************************************
     **  Static methods operating on char arrays
     **/

    /**
     * Computes the number of bytes needed to encode a range of a char array.
     * <p/>
     * Note that {@code clen} is compared directly with the character index, and
     * thus acts as the exclusive end index of the range.
     *
     * @param obj  the array containing the characters
     * @param cpos the index of the first character (inclusive)
     * @param clen the index of the last character (exclusive)
     * @return the number of bytes of the encoded characters
     */
    public static long bytesLen(char[] obj, long cpos, long clen) {
        int len=0;
        for(int i=(int) cpos; i<clen; i++) {
            len+=encodedLength(obj[i]);
        }
        return len;
    }

    /**
     * Encodes a range of a char array into a byte array.
     * <p/>
     * Encoding stops without error if the output space is exhausted between two
     * characters. Note that {@code clen} is compared directly with the character
     * index, and thus acts as the exclusive end index of the range.
     *
     * @param obj  the array containing the characters
     * @param cpos the index of the first character (inclusive)
     * @param clen the index of the last character (exclusive)
     * @param dat  the array receiving the encoded bytes
     * @param pos  the offset of the first byte to write
     * @param len  the maximum number of bytes to write
     * @return the number of bytes written
     * @throws EncodingException if the output space is exhausted in the middle
     *                           of a sequence, or an array is accessed out of bounds
     */
    public static long bytes(char[] obj, long cpos, long clen, byte[] dat, long pos, long len) {
        try {
            int p=(int) pos;
            int i=(int) cpos;
            while(i<clen) {
                if(p-pos>=len) {
                    return p-pos;
                }
                p=encode(obj[i++], dat, p, pos, len);
            }
            return p-pos;
        }
        catch(ArrayIndexOutOfBoundsException e) {
            throw new EncodingException(e);
        }
    }

    /**
     * Computes the number of bytes needed to encode a range of a character sequence.
     * <p/>
     * Note that {@code clen} is compared directly with the character index, and
     * thus acts as the exclusive end index of the range.
     *
     * @param obj  the sequence containing the characters
     * @param cpos the index of the first character (inclusive)
     * @param clen the index of the last character (exclusive)
     * @return the number of bytes of the encoded characters
     */
    public static long bytesLen(CharSequence obj, long cpos, long clen) {
        int len=0;
        for(int i=(int) cpos; i<clen; i++) {
            len+=encodedLength(obj.charAt(i));
        }
        return len;
    }

    /**
     * Encodes a range of a character sequence into a byte array.
     * <p/>
     * Encoding stops without error if the output space is exhausted between two
     * characters. Note that {@code clen} is compared directly with the character
     * index, and thus acts as the exclusive end index of the range.
     *
     * @param obj  the sequence containing the characters
     * @param cpos the index of the first character (inclusive)
     * @param clen the index of the last character (exclusive)
     * @param dat  the array receiving the encoded bytes
     * @param pos  the offset of the first byte to write
     * @param len  the maximum number of bytes to write
     * @return the number of bytes written
     * @throws EncodingException if the output space is exhausted in the middle
     *                           of a sequence, or the byte array is accessed out of bounds
     */
    public static long bytes(CharSequence obj, long cpos, long clen, byte[] dat, long pos, long len) {
        try {
            int p=(int) pos;
            int i=(int) cpos;
            while(i<clen) {
                if(p-pos>=len) {
                    return p-pos;
                }
                p=encode(obj.charAt(i++), dat, p, pos, len);
            }
            return p-pos;
        }
        catch(ArrayIndexOutOfBoundsException e) {
            throw new EncodingException(e);
        }
    }

    /**
     * Decodes a range of encoded bytes into a char array.
     * <p/>
     * Decoding stops when either the end of the data or the end of the output
     * space is reached. Note that {@code len} is compared directly with the byte
     * offset, and thus acts as the exclusive end offset of the data, whereas
     * {@code clen} is a number of characters.
     *
     * @param out  the array receiving the decoded characters
     * @param cpos the index of the first character to write
     * @param clen the maximum number of characters to write
     * @param dat  the encoded data
     * @param pos  the offset of the first byte (inclusive)
     * @param len  the offset of the last byte (exclusive)
     * @return the number of characters written
     * @throws DecodingException if an invalid leading byte is found, or an array
     *                           is accessed out of bounds
     */
    public static long chars(char[] out, long cpos, long clen, byte[] dat, long pos, long len) {
        int p=(int) pos;
        int q=(int) cpos;
        clen+=cpos;
        while(p<len && q<clen) {
            try {
                int b=0xFF&dat[p++];
                if(b<0x80) {
                    out[q++]=(char) (b&0x7F);
                }
                else if(b<0xC0) {
                    throw new DecodingException("Incomplete Unicode sequence");
                }
                else if(b<0xE0) {
                    out[q++]=(char) (((b&0x1F)<<6)|(dat[p++]&0x3F));
                }
                else if(b<0xF0) {
                    out[q++]=(char) (((b&0x0F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F));
                }
                else if(b<0xF8) {
                    // The decoded value does not fit a char, and is truncated to 16 bits
                    out[q++]=(char) (((b&0x07)<<18)|((dat[p++]&0x3F)<<12)|((dat[p++]&0x3F)<<6)|(dat[p++]&0x3F));
                }
                else {
                    throw new DecodingException("Incomplete Unicode sequence");
                }
            }
            catch(ArrayIndexOutOfBoundsException e) {
                throw new DecodingException(e);
            }
        }
        return q-cpos;
    }

    /**
     * Returns the number of bytes used to encode a single char.
     *
     * @param c the character to encode
     * @return the length of the encoded sequence, between 1 and 3
     */
    private static int encodedLength(char c) {
        if(c<0x80) {
            return 1;
        }
        if(c<0x800) {
            return 2;
        }
        // A char holds at most 16 bits, which always fit a three-byte sequence
        return 3;
    }

    /**
     * Encodes a single char into a byte array.
     *
     * @param c   the character to encode
     * @param dat the array receiving the encoded bytes
     * @param p   the offset of the first byte to write
     * @param pos the offset where the output space starts
     * @param len the size of the output space
     * @return the offset following the last byte written
     * @throws EncodingException if the output space is exhausted before the
     *                           last byte of the sequence
     */
    private static int encode(char c, byte[] dat, int p, long pos, long len) {
        if(c<0x80) {
            dat[p++]=(byte) (0x7F&c);
        }
        else if(c<0x800) {
            dat[p++]=(byte) (0xC0|(0x1F&(c>>>6)));
            if(p-pos>=len) {
                throw new EncodingException();
            }
            dat[p++]=(byte) (0x80|(0x3F&c));
        }
        else {
            // A char holds at most 16 bits, which always fit a three-byte sequence
            dat[p++]=(byte) (0xE0|(0x0F&(c>>>12)));
            if(p-pos>=len) {
                throw new EncodingException();
            }
            dat[p++]=(byte) (0x80|(0x3F&(c>>>6)));
            if(p-pos>=len) {
                throw new EncodingException();
            }
            dat[p++]=(byte) (0x80|(0x3F&c));
        }
        return p;
    }

}