/*
* `gnu.iou'
* Copyright (C) 2006 John Pritchard.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
* 02111-1307 USA
*/
package gnu.iou ;
/**
 * <p> Static tools for UTF-8 encoding and decoding of Java
 * <code>char</code> values.</p>
 *
 * <p> UTF-8 is ASCII-transparent: characters in the range
 * <code>0x01</code> to <code>0x7F</code> are encoded as themselves in
 * a single byte.  Characters beyond the seven bit ASCII range
 * (Unicode, ISO-8859, ISO-10646, etc.) are encoded in two or three
 * bytes.  Each <code>char</code> is encoded on its own; four byte
 * sequences are neither produced by the encoder nor accepted by the
 * decoder.</p>
 *
 * <p> The character <code>0x0</code> is encoded in two bytes
 * (<code>0xC0 0x80</code>), so a zero byte never occurs inside
 * encoded text.  Zero bytes are used only as padding (see {@link
 * #encode(String,int)}) and stop the decoder.</p>
 *
 * <p> We do not use an ISO UCS code signature, and we do not use a
 * Java Data I/O-style strlen prefix. </p>
 *
 * @author John Pritchard (jdp@syntelos)
 */
public abstract class utf8 {

    /**
     * Decode UTF-8 input.  Decoding terminates at the end of the
     * array or at the first null byte, value 0x0, whichever comes
     * first.
     *
     * @param code UTF-8 encoded bytes.
     *
     * @return Decoded characters, or null for a null or empty
     * argument.
     *
     * @exception IllegalStateException Bad format.
     */
    public final static char[] decode( byte[] code){
        if ( null == code)
            return null;
        else
            return decode(code,0,code.length);
    }

    /**
     * Decode UTF-8 input from a region of an array.  Decoding
     * terminates at the end of the region or at the first null byte,
     * value 0x0, whichever comes first.
     *
     * <p> One, two and three byte sequences are accepted.  A lead
     * byte in the ranges <code>0x80-0xBF</code> or
     * <code>0xF0-0xFF</code>, a sequence running past the end of the
     * region, or a continuation byte not of the form
     * <code>10xxxxxx</code> is rejected.</p>
     *
     * <p> The region is not validated against the array bounds: an
     * offset or count reaching outside of the array produces an
     * array index exception from the array access itself.</p>
     *
     * @param code UTF-8 encoded bytes.
     * @param off Index of the first byte to decode.
     * @param many Number of bytes in the region to decode.
     *
     * @return Decoded characters as produced by the character
     * buffer's <code>toCary</code>, or null for a null or empty
     * array.
     *
     * @exception IllegalStateException Bad format.
     */
    public final static char[] decode( byte[] code, int off, int many){
        if ( null == code || 0 >= code.length)
            return null;

        chbuf strbuf = new chbuf(code.length);
        final int trm = (off+many);
        int ch, ch2, ch3;
        char tmpc;
        for ( int cc = off; cc < trm; ){
            ch = (code[cc]&0xff);
            if (0x0 == ch){
                // Null terminator or padding: the encoder never emits
                // a zero byte for text (0x0 is written as 0xC0 0x80).
                break;
            }
            switch (ch >> 4) {
            case 0:
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                // 0xxxxxxx: one byte, seven bits.
                tmpc = (char)ch;
                strbuf.append(tmpc);
                cc += 1;
                break;
            case 12:
            case 13:
                // 110xxxxx 10xxxxxx: two bytes, eleven bits.
                if ((cc+2) > trm)
                    throw new IllegalStateException("Truncated two byte UTF-8 sequence at offset "+cc);
                ch2 = (code[cc+1]&0xff);
                if (0x80 != (ch2 & 0xC0))
                    throw new IllegalStateException("Bad UTF-8 continuation byte at offset "+(cc+1));
                tmpc = (char)(((ch & 0x1F) << 6)|(ch2 & 0x3F));
                strbuf.append(tmpc);
                cc += 2;
                break;
            case 14:
                // 1110xxxx 10xxxxxx 10xxxxxx: three bytes, sixteen bits.
                if ((cc+3) > trm)
                    throw new IllegalStateException("Truncated three byte UTF-8 sequence at offset "+cc);
                ch2 = (code[cc+1]&0xff);
                ch3 = (code[cc+2]&0xff);
                if ((0x80 != (ch2 & 0xC0)) || (0x80 != (ch3 & 0xC0)))
                    throw new IllegalStateException("Bad UTF-8 continuation byte after offset "+cc);
                tmpc = (char)(((ch & 0x0F) << 12)|
                              ((ch2 & 0x3F) << 6) |
                              (ch3 & 0x3F));
                strbuf.append(tmpc);
                cc += 3;
                break;
            default:
                // 10xxxxxx (stray continuation) or 1111xxxx (four or
                // more bytes): not supported.
                throw new IllegalStateException("Bad UTF-8 lead byte 0x"+Integer.toHexString(ch)+" at offset "+cc);
            }
        }
        return strbuf.toCary();
    }

    /**
     * Encode characters in UTF-8.
     *
     * @param str Characters to encode.
     *
     * @return Encoded bytes, or null for a null or empty argument.
     */
    public final static byte[] encode( char[] str){
        if ( null == str || 0 >= str.length)
            return null;
        else
            return encode( str, null).toByteArray();
    }

    /**
     * Encode characters in UTF-8, writing into a byte buffer.
     *
     * <p> Characters <code>0x01-0x7F</code> are written as one byte,
     * <code>0x0</code> and <code>0x80-0x7FF</code> as two bytes, and
     * everything above <code>0x7FF</code> as three bytes.</p>
     *
     * @param str Characters to encode; null or empty writes nothing.
     * @param bytbuf Buffer to write into, or null to have a new
     * buffer created.
     *
     * @return The buffer written to: the argument buffer when one
     * was given, otherwise the new buffer.
     */
    public final static bbuf encode( char[] str, bbuf bytbuf){
        // Examine the input before touching its length, so that a
        // null input with a null buffer yields an empty buffer.
        final int len = (null == str)?(0):(str.length);
        if ( null == bytbuf)
            bytbuf = new bbuf( len);
        if ( 0 >= len)
            return bytbuf;

        char ch;
        for ( int cc = 0; cc < len; cc++){
            ch = str[cc];
            if ((0x0 < ch) && (0x80 > ch))
                bytbuf.write(ch);
            else if (0x07FF < ch){
                bytbuf.write(0xE0 | ((ch >> 12) & 0x0F));
                bytbuf.write(0x80 | ((ch >>  6) & 0x3F));
                bytbuf.write(0x80 | (ch & 0x3F));
            }
            else {
                // Includes 0x0, written as 0xC0 0x80 to keep zero
                // bytes out of the encoded text.
                bytbuf.write(0xC0 | ((ch >>  6) & 0x1F));
                bytbuf.write(0x80 | (ch & 0x3F));
            }
        }
        return bytbuf;
    }

    /**
     * Encode string in UTF-8.
     *
     * @param s String to encode.
     *
     * @return Encoded bytes, or null when the argument is null or
     * nothing was encoded.
     */
    public final static byte[] encode ( String s){
        if ( null == s)
            return null;

        bbuf bytbuf = encode(s.toCharArray(),null);
        if ( 0 < bytbuf.length())
            return bytbuf.toByteArray();
        else
            return null;
    }

    /**
     * Encode string in UTF-8, and add null padding to paddedlen if
     * necessary.  An encoding of paddedlen or more bytes is left as
     * it is: nothing is truncated.
     *
     * @param s String to encode.
     * @param paddedlen Minimum number of bytes wanted.
     *
     * @return Encoded and padded bytes, or null for a null string.
     */
    public final static byte[] encode ( String s, int paddedlen){
        if ( null == s)
            return null;

        bbuf bytbuf = encode(s.toCharArray(),null);
        int delta = paddedlen- bytbuf.length();
        if ( 0 < delta)
            bytbuf.nwrite( (byte)0, delta);
        return bytbuf.toByteArray();
    }

    /**
     * Returns the length of the string encoded in UTF-8.
     *
     * @param s String to measure.
     *
     * @return Number of bytes, zero for a null argument.
     */
    public final static int encoded ( String s){
        if ( null == s)
            return 0;
        else
            return encoded(s.toCharArray());
    }

    /**
     * Returns the length of the characters encoded in UTF-8, using
     * the same ranges as {@link #encode(char[],bbuf)}: one byte for
     * <code>0x01-0x7F</code>, two bytes for <code>0x0</code> and
     * <code>0x80-0x7FF</code>, three bytes otherwise.
     *
     * @param str Characters to measure.
     *
     * @return Number of bytes, zero for a null or empty argument.
     */
    public final static int encoded( char[] str){
        if ( null == str || 0 >= str.length)
            return 0;

        int bytlen = 0;
        char ch;
        for ( int cc = 0; cc < str.length; cc++){
            ch = str[cc];
            if ((0x0 < ch) && (0x80 > ch))
                bytlen += 1;
            else if (0x07FF < ch)
                bytlen += 3;
            else
                bytlen += 2; // includes 0x0, encoded as 0xC0 0x80
        }
        return bytlen;
    }

    /**
     * The ubiquitous exclusive-or hash.  XORs each input byte into
     * one of the eight byte positions of a long integer.
     *
     * <p> Positions are counted from the end of the input: the last
     * byte is XORed into the low byte of the hash, the byte before
     * it into the next higher byte, and so on, wrapping around to
     * the low byte after every eight bytes.  For an input of eight
     * bytes, byte zero lands in the high byte of the hash.  In the
     * following, <code>e[n]</code> is the byte <code>n</code> places
     * before the last byte of the input.
     *
     * <pre>
     *  |         long hash: 8 bytes, 64 bits           |
     *  | high                                      low |
     *
     *    e[7]  e[6]  e[5]  e[4]  e[3]  e[2]  e[1]  e[0]
     *
     *    e[f]  e[e]  e[d]  e[c]  e[b]  e[a]  e[9]  e[8]
     *
     *    ...
     *
     * </pre>
     *
     * <p> Each byte is widened as a signed value before shifting, so
     * a byte with its high bit set also flips every hash bit above
     * its own position.  This is kept as it is so that hash values
     * remain the same as those computed previously.
     *
     * @param b Binary input to hash.
     *
     * @return Hash value, zero for an empty array.
     *
     * @exception IllegalArgumentException For a null argument.
     */
    public final static long xor64 ( byte[] b){
        if ( null == b) throw new IllegalArgumentException("Null argument for hash function.");
        long accum = 0;
        int shift;
        for ( int c = 0, uc = b.length- 1; c < b.length; c++, uc--){
            shift = ((uc % 8) * 8);
            accum ^= ((long)b[c] << shift);
        }
        return accum;
    }

    /**
     * Hash the ASCII string (the low byte of wide character values).
     *
     * @param str String to hash
     *
     * @return Hash of the low bytes of the characters of the string.
     *
     * @exception IllegalArgumentException For null argument.
     */
    public final static long xor64_ascii ( String str){
        if ( null == str) throw new IllegalArgumentException("Null argument for hash function.");
        final int strlen = str.length();
        byte[] bb = new byte[strlen];
        for ( int cc = 0; cc < strlen; cc++){
            // Keep the low eight bits of each character.
            bb[cc] = (byte)str.charAt(cc);
        }
        return xor64(bb);
    }

    /**
     * Hash the UTF-8 string.
     *
     * @param str String encoded into UTF-8, then hashed.
     *
     * @return Hash of the encoded bytes.
     *
     * @exception IllegalArgumentException For null argument or empty
     * string.
     */
    public final static long xor64 ( String str){
        return xor64( encode(str));
    }

    /**
     * Unique file path and state hash: a file will always have the
     * same hash until it is modified.
     *
     * <p> Hash of path and last modified date as used for the server
     * response "ETag" header in "HTTP/1.1".
     *
     * <p> The file's "path" is the String value with which the object
     * was constructed, not necessarily its absolute path or canonical
     * path.  Its hash is the xor-fold of the low bytes of the string
     * as ASCII characters.  Foregoing UTF encoding for all paths,
     * using only the low ASCII bytes, is seen as a "good enough"
     * solution for performance reasons: it will not suffer any loss
     * of consistency.
     *
     * <p> Many users (both clients and servers) will construct their
     * file objects using their absolute path for many reasons, and so
     * this function hashes only the user's explicit input.
     *
     * @param f File path is hashed, then XOR'ed with its last
     * modified timestamp (in milliseconds since Jan 1 1970 UT).
     *
     * @return The combined value as a hexadecimal string.
     *
     * @exception IllegalArgumentException For null argument.
     */
    public final static String ETag ( java.io.File f){
        if ( null == f) throw new IllegalArgumentException("Null file argument to ETag function.");
        long pathhash = xor64_ascii(f.getPath());
        long modified = f.lastModified();
        return Long.toHexString(pathhash ^ modified);
    }
}