utf8.java example

Explorer
robonobo-master
 /* 
  *  `gnu.iou' 
  *  Copyright (C) 2006 John Pritchard.
  *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License as
  *  published by the Free Software Foundation; either version 2 of
  *  the License, or (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  *  02111-1307 USA
  */
package gnu.iou ;

/**
 * <p> This class contains static tools for doing UTF-8 encoding and
 * decoding.</p>
 *
 * <p> UTF-8 is ASCII-transparent.  It supports character sets
 * requiring more than the seven-bit ASCII base range of UTF-8,
 * including Unicode, ISO-8859, ISO-10646, etc.</p>
 * 
 * <p> We do not use an ISO UCS code signature, and we do not use a
 * Java Data I/O-style strlen prefix.  </p>
 *
 * @author John Pritchard (jdp@syntelos)
 */
public abstract class utf8 {

    /**
     * Decodes an entire byte array of UTF-8 into characters.
     *
     * <p> Convenience form of {@link #decode(byte[],int,int)} covering
     * the range from index zero to <code>code.length</code>.</p>
     *
     * @param code UTF-8 encoded bytes, may be null.
     *
     * @return The decoded characters, or null when the argument is
     * null (the ranged decoder likewise returns null for an empty
     * array).
     *
     * @exception IllegalStateException Bad format.
     */
    public final static char[] decode( byte[] code){

        return (code != null) ? decode(code, 0, code.length) : null;
    }
    /**
     * Decodes a range of UTF-8 bytes into characters.
     *
     * <p> The form of each sequence is selected by its lead byte:
     * <code>0x00-0x7F</code> is a single byte copied as is,
     * <code>0xC0-0xDF</code> starts a two byte sequence, and
     * <code>0xE0-0xEF</code> starts a three byte sequence.  Every
     * other lead byte (a stray continuation byte, or the lead of a
     * four byte sequence) is rejected.</p>
     *
     * <p> NOTE(review): earlier documentation of this method said
     * that decoding terminates at a null character (0x0).  The loop
     * has no such stop: a 0x00 byte is appended as a character like
     * any other single byte value and decoding continues to the end
     * of the range.  Confirm which behavior callers expect.</p>
     *
     * <p> The range is not validated against the array bounds
     * here.</p>
     *
     * @param code UTF-8 encoded bytes, may be null.
     * @param off Index of the first byte to decode.
     * @param many Number of bytes to decode, starting at 'off'.
     *
     * @return The content of the character buffer as produced by its
     * <code>toCary</code> method, or null when the array is null or
     * empty.
     *
     * @exception IllegalStateException Bad format: an unsupported
     * lead byte, a sequence running past the end of the range, or a
     * continuation byte not of the form <code>10xxxxxx</code>.
     */
    public final static char[] decode( byte[] code, int off, int many){

        if (code == null || code.length <= 0)
            return null;

        // presumably the constructor argument is an initial capacity -- TODO confirm
        final chbuf out = new chbuf(code.length);
        final int end = (off + many);
        int pos = off;

        while (pos < end){
            final int lead = (code[pos] & 0xff);

            if (lead < 0x80){
                // 0xxxxxxx: single byte
                pos += 1;
                out.append((char)lead);
            }
            else if (0xC0 == (lead & 0xE0)){
                // 110xxxxx 10xxxxxx
                pos += 2;
                if (pos > end)
                    throw new IllegalStateException();

                final int second = (code[pos-1] & 0xff);
                if (0x80 != (second & 0xC0))
                    throw new IllegalStateException();

                out.append((char)(((lead & 0x1F) << 6) | (second & 0x3F)));
            }
            else if (0xE0 == (lead & 0xF0)){
                // 1110xxxx 10xxxxxx 10xxxxxx
                pos += 3;
                if (pos > end)
                    throw new IllegalStateException();

                final int second = (code[pos-2] & 0xff);
                final int third = (code[pos-1] & 0xff);
                if (0x80 != (second & 0xC0) || 0x80 != (third & 0xC0))
                    throw new IllegalStateException();

                out.append((char)(((lead & 0x0F) << 12) |
                                  ((second & 0x3F) << 6) |
                                  (third & 0x3F)));
            }
            else {
                // 10xxxxxx as a lead byte, or 1111xxxx (four byte form)
                throw new IllegalStateException();
            }
        }
        return out.toCary();
    }

    /**
     * Encodes characters into a new UTF-8 byte array.
     *
     * @param str Characters to encode, may be null.
     *
     * @return The bytes collected by {@link #encode(char[],bbuf)} into
     * a fresh buffer, or null when the argument is null or empty.
     */
    public final static byte[] encode( char[] str){

        final boolean nothing = (str == null) || (str.length <= 0);
        if (nothing)
            return null;

        return encode(str, null).toByteArray();
    }

    /**
     * Encodes characters in UTF-8, appending to a byte buffer.
     *
     * <p> Each character is encoded on its own: values
     * <code>0x0001-0x007F</code> take one byte, values above
     * <code>0x07FF</code> take three bytes, and everything else --
     * including the null character, written as <code>0xC0
     * 0x80</code> -- takes two bytes.  No four byte sequences are
     * produced; surrogate characters are written as individual three
     * byte sequences.</p>
     *
     * @param str Characters to encode.  A null or empty array
     * appends nothing.
     * @param bytbuf Buffer to append to.  When null, a new buffer is
     * created with the number of characters as its constructor
     * argument (zero for a null string).
     *
     * @return The buffer written to: the argument buffer when one
     * was given, otherwise the newly created one.  Never null.
     */
    public final static bbuf encode( char[] str, bbuf bytbuf){

        // Take the length before touching the buffer, so that a null
        // string with a null buffer yields an empty buffer rather than
        // a NullPointerException.
        final int len = (null == str) ? 0 : str.length;

        if ( null == bytbuf) bytbuf = new bbuf( len);

        if ( 0 >= len) return bytbuf;

        char ch;

        for ( int cc = 0; cc < len; cc++){

            ch = str[cc];
            if ((0x0 < ch) && (0x80 > ch))
                bytbuf.write(ch);
            else if (0x07FF < ch){
                bytbuf.write(0xE0 | ((ch >> 12) & 0x0F));
                bytbuf.write(0x80 | ((ch >>  6) & 0x3F));
                bytbuf.write(0x80 | (ch & 0x3F));
            }
            else {
                // 0x0080-0x07FF, and 0x0000 (never written as a raw zero byte)
                bytbuf.write(0xC0 | ((ch >>  6) & 0x1F));
                bytbuf.write(0x80 | (ch & 0x3F));
            }
        }
        return bytbuf;
    }

    /**
     * Encodes a string into a new UTF-8 byte array.
     *
     * @param s String to encode, may be null.
     *
     * @return The encoded bytes, or null when the argument is null
     * or nothing was encoded (the empty string).
     */
    public final static byte[] encode ( String s){

        if (s == null)
            return null;

        final bbuf out = encode(s.toCharArray(), null);

        return (out.length() > 0) ? out.toByteArray() : null;
    }

    /**
     * Encodes a string in UTF-8 and pads the result with zero bytes
     * up to a minimum length.
     *
     * <p> Padding is only ever added: an encoding already at or
     * beyond 'paddedlen' bytes is returned at its full length.</p>
     *
     * @param s String to encode, may be null.
     * @param paddedlen Minimum number of bytes wanted in the result.
     *
     * @return The content of the byte buffer after encoding and
     * padding, or null when the string is null.
     */
    public final static byte[] encode ( String s, int paddedlen){

        if (s == null)
            return null;

        final bbuf out = encode(s.toCharArray(), null);

        final int missing = paddedlen - out.length();

        if (missing > 0)
            out.nwrite( (byte)0, missing);

        return out.toByteArray();
    }

    /**
     * Computes the UTF-8 encoded length of a string without
     * encoding it.
     *
     * @param s String to measure, may be null.
     *
     * @return Zero for a null string, otherwise the result of
     * {@link #encoded(char[])} on the string's characters.
     */
    public final static int encoded ( String s){

        return (s == null) ? 0 : encoded(s.toCharArray());
    }

    /**
     * Returns the number of bytes the characters occupy once encoded
     * in UTF-8 by this class.
     *
     * <p> The count follows the same ranges the encoder in this class
     * uses: one byte for <code>0x0001-0x007F</code>, three bytes above
     * <code>0x07FF</code>, and two bytes for everything else.  The
     * null character falls into the two byte case, because the
     * encoder writes it as <code>0xC0 0x80</code> rather than as a
     * raw zero byte.</p>
     *
     * @param str Characters to measure, may be null.
     *
     * @return Encoded length in bytes; zero for a null or empty
     * array.
     */
    public final static int encoded( char[] str){

        if ( null == str || 0 >= str.length) return 0;

        int bytlen = 0;

        char ch;

        for ( int c = 0; c < str.length; c++){

            ch = str[c];

            if ((0x0 < ch) && (0x80 > ch))

                bytlen++;

            else if (0x07FF < ch)

                bytlen += 3;

            else
                // 0x0080-0x07FF, and 0x0000
                bytlen += 2;
        }

        return bytlen;
    }

    /**
     * The ubiquitous exclusive-or hash.  Folds the input into a long
     * integer, eight bytes at a time, by XOR.
     *
     * <p> Input bytes are placed counting from the end of the array:
     * the last byte is XOR'ed into the lowest byte of the hash, the
     * one before it into the next higher byte, and so on, wrapping
     * around after every eight bytes.  For input of up to eight bytes
     * the result is therefore the input read as a big-endian unsigned
     * number.</p>
     *
     * <pre>
     * |     long hash: 8 bytes, 64 bits     |
     * | high                            low |
     *
     *  n-8  n-7  n-6  n-5  n-4  n-3  n-2  n-1     (input index, n = length)
     *
     *  n-16 n-15 n-14 n-13 n-12 n-11 n-10 n-9
     *
     *  ...
     * </pre>
     *
     * <p> Each byte is treated as an unsigned value, so it affects
     * only its own eight bits of the hash.  (Converting the byte to
     * long without masking sign-extends values of 0x80 and above and
     * would flip every bit above the byte's position.)</p>
     *
     * @param b Binary input to hash.
     *
     * @return The 64 bit hash; zero for an empty array.
     *
     * @exception IllegalArgumentException For a null argument.
     */
    public final static long xor64 ( byte[] b){
        if ( null == b) throw new IllegalArgumentException("Null argument for hash function.");

        long accum = 0L;

        for ( int c = 0, uc = b.length- 1; c < b.length; c++, uc--){

            // position of this byte within the hash, counted from the low end
            final int shift = ((uc % 8) * 8);

            // mask to 0..255 before shifting to avoid sign extension
            accum ^= ((b[c] & 0xFFL) << shift);
        }
        return accum;
    }

    /**
     * Hash the ASCII string (the low byte of wide character values).
     *
     * <p> Every character is narrowed to its low eight bits, the
     * high eight bits are discarded, and the resulting bytes are
     * hashed with {@link #xor64(byte[])}.  The narrowing is done
     * explicitly here; it yields the same bytes as the deprecated
     * <code>String.getBytes(int,int,byte[],int)</code>.</p>
     *
     * @param str String to hash
     *
     * @return Hash of the low bytes of the string's characters.
     *
     * @exception IllegalArgumentException For null argument.
     */
    public final static long xor64_ascii ( String str){
        if ( null == str) throw new IllegalArgumentException("Null argument for hash function.");

        int strlen = str.length();

        byte[] bb = new byte[strlen];

        for ( int cc = 0; cc < strlen; cc++){
            // narrowing cast keeps only the low order eight bits
            bb[cc] = (byte)str.charAt(cc);
        }

        return xor64(bb);
    }

    /**
     * Hashes a string by way of its UTF-8 encoding.
     *
     * <p> The string is encoded with {@link #encode(String)} and the
     * bytes are passed to {@link #xor64(byte[])}.  The encoder
     * returns null for both a null and an empty string, which the
     * byte hash rejects.</p>
     *
     * @param str String encoded into UTF-8, then hashed.
     *
     * @return Hash of the encoded bytes.
     *
     * @exception IllegalArgumentException For null argument or empty
     * string.
     */
    public final static long xor64 ( String str){

        final byte[] utf = encode(str);

        return xor64(utf);
    }

    /**
     * File path and state hash, as used for the server response
     * "ETag" header in "HTTP/1.1".
     *
     * <p> The value is the hexadecimal form of the path hash XOR'ed
     * with the file's last modified time, so it stays the same for a
     * given path until the modification time changes.</p>
     *
     * <p> The "path" is whatever <code>File.getPath()</code>
     * returns, the string the file object was constructed with, not
     * necessarily an absolute or canonical path.  It is hashed with
     * {@link #xor64_ascii(String)}, which uses only the low byte of
     * each character; skipping UTF encoding here was a deliberate
     * "good enough" choice made for performance.</p>
     *
     * @param f File whose path is hashed and XOR'ed with its last
     * modified timestamp (in milliseconds since Jan 1 1970 UT).
     *
     * @return Lower case hexadecimal string of the combined value,
     * as produced by <code>Long.toHexString</code>.
     *
     * @exception IllegalArgumentException For null argument.
     */
    public final static String ETag ( java.io.File f){
        if (f == null)
            throw new IllegalArgumentException("Null file argument to ETag function.");

        final long pathHash = xor64_ascii(f.getPath());

        final long modified = f.lastModified();

        return Long.toHexString(pathHash ^ modified);
    }

}