UTF8.java example

Explorer
eXist-1.4.x-master
/* 
 *  `gnu.iou' I/O buffers and utilities.
 *  Copyright (C) 1998, 1999, 2000, 2001, 2002 John Pritchard.
 *
 *  This program is free software; you can redistribute it or modify
 *  it under the terms of the GNU Lesser General Public License as
 *  published by the Free Software Foundation; either version 2.1 of
 *  the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this program; if not, write to the Free
 *  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 *  02111-1307 USA
 */

package org.exist.util;

/**
 * Static helpers for converting between Java <code>char</code> data and
 * UTF-8 bytes.
 *
 * <p> UTF-8 is ASCII-transparent: characters in the seven bit ASCII range
 * are stored as a single, unchanged byte; every other character is stored
 * as a multi-byte sequence.
 *
 * <p> No byte order mark (ISO UCS code signature) and no Java Data
 * I/O-style length prefix is written or interpreted.
 *
 * <p> The <code>encode</code> and <code>encoded</code> methods work on
 * individual <code>char</code> values and never emit more than three bytes
 * per <code>char</code>.  Both halves of a surrogate pair are therefore
 * written as separate three-byte sequences (six bytes in total, similar to
 * CESU-8) rather than as one four-byte UTF-8 sequence.  Use
 * {@link #getUTF8Encoding(char, char, byte[])} when standard four-byte
 * sequences are required.  {@link #decode(byte[], int, int, XMLString)}
 * reads both forms.
 *
 * @author John Pritchard (john@syntelos.org)
 */
public class UTF8 {

	/** Largest value that still fits into a single <code>char</code>. */
	private static final int MAX_BMP = 0xFFFF;

	/** First code point that needs a surrogate pair. */
	private static final int MIN_SUPPLEMENTARY = 0x10000;

	/** Largest code point defined by Unicode. */
	private static final int MAX_CODE_POINT = 0x10FFFF;

	/**
	 * Decodes a complete byte array.
	 *
	 * @param code the UTF-8 bytes, may be <code>null</code>
	 * @return the decoded characters, or <code>null</code> if
	 * <code>code</code> is <code>null</code> or empty
	 * @throws IllegalStateException if the input ends inside a multi-byte
	 * sequence or encodes a value above U+10FFFF
	 */
	public static final XMLString decode(byte[] code) {

		if (null == code) {
			return null;
		}
		return decode(code, 0, code.length);
	}

	/**
	 * Decodes <code>many</code> bytes starting at <code>off</code> into a
	 * newly created {@link XMLString}.
	 *
	 * @param code the UTF-8 bytes, may be <code>null</code>
	 * @param off index of the first byte to decode
	 * @param many number of bytes to decode
	 * @return the decoded characters, or <code>null</code> if
	 * <code>code</code> is <code>null</code> or empty
	 * @throws IllegalStateException if the input ends inside a multi-byte
	 * sequence or encodes a value above U+10FFFF
	 */
	public static final XMLString decode(byte[] code, int off, int many) {

		if (null == code || 0 >= code.length) {
			return null;
		}
		return decode(code, off, many, new XMLString(many));
	}

	/**
	 * Decodes <code>many</code> bytes starting at <code>off</code> and
	 * appends the resulting characters to <code>xs</code>.
	 *
	 * <p> Zero bytes do not terminate decoding; they are appended as the
	 * character U+0000 like any other ASCII byte.  Continuation bytes are
	 * not validated: only their low six bits are used.  A byte that cannot
	 * start a sequence (a stray continuation byte, 0xFE or 0xFF) is
	 * appended as the character formed by its low seven bits.  Sequences
	 * of four bytes that encode a supplementary character are appended as
	 * a surrogate pair.
	 *
	 * @param code the UTF-8 bytes, may be <code>null</code>
	 * @param off index of the first byte to decode
	 * @param many number of bytes to decode
	 * @param xs receives the decoded characters
	 * @return <code>xs</code>, or <code>null</code> if <code>code</code> is
	 * <code>null</code> or empty
	 * @throws IllegalStateException if a multi-byte sequence extends past
	 * the requested range (or past the end of the array), or if a sequence
	 * encodes a value above U+10FFFF
	 */
	public static final XMLString decode(byte[] code, int off, int many, XMLString xs) {

		if (null == code || 0 >= code.length) {
			return null;
		}

		final int end = off + many;
		// Continuation bytes must lie inside the requested window and
		// inside the array itself.
		final int limit = Math.min(end, code.length);

		int pos = off;
		while (pos < end) {

			final int lead = code[pos] & 0xFF;
			final int extra = continuationCount(lead);

			if (0 == extra) {
				// Plain ASCII is copied unchanged (the mask is a no-op);
				// an invalid lead byte is reduced to its low seven bits.
				xs.append((char) (lead & 0x7F));
				pos++;
				continue;
			}

			final int last = pos + extra;
			if (last >= limit) {
				throw new IllegalStateException(
					"Truncated UTF-8 sequence at offset " + pos + ": expected "
						+ extra + " continuation byte(s)");
			}

			// The lead byte carries (6 - extra) payload bits, every
			// continuation byte carries six more.
			int codePoint = lead & (0x3F >> extra);
			for (int i = pos + 1; i <= last; i++) {
				codePoint = (codePoint << 6) | (code[i] & 0x3F);
			}

			appendCodePoint(codePoint, pos, xs);
			pos = last + 1;
		}

		return xs;
	}

	/**
	 * Returns the number of continuation bytes announced by a lead byte,
	 * or 0 if the value is ASCII or cannot start a multi-byte sequence.
	 */
	private static int continuationCount(int lead) {

		if (0xC0 == (lead & 0xE0)) {
			return 1;
		}
		if (0xE0 == (lead & 0xF0)) {
			return 2;
		}
		if (0xF0 == (lead & 0xF8)) {
			return 3;
		}
		if (0xF8 == (lead & 0xFC)) {
			return 4;
		}
		if (0xFC == (lead & 0xFE)) {
			return 5;
		}
		return 0;
	}

	/**
	 * Appends a decoded value to <code>xs</code>, splitting supplementary
	 * code points into a high and a low surrogate.
	 *
	 * @throws IllegalStateException if the value is above U+10FFFF
	 */
	private static void appendCodePoint(int codePoint, int pos, XMLString xs) {

		if (codePoint <= MAX_BMP) {
			xs.append((char) codePoint);
		} else if (codePoint <= MAX_CODE_POINT) {
			final int v = codePoint - MIN_SUPPLEMENTARY;
			xs.append((char) (0xD800 + (v >> 10)));
			xs.append((char) (0xDC00 + (v & 0x3FF)));
		} else {
			throw new IllegalStateException(
				"UTF-8 sequence at offset " + pos
					+ " encodes a value outside the Unicode range: 0x"
					+ Integer.toHexString(codePoint));
		}
	}

	/**
	 * Encodes a character array into a newly allocated byte array.
	 *
	 * @param str the characters to encode, may be <code>null</code>
	 * @return the encoded bytes, or <code>null</code> if <code>str</code>
	 * is <code>null</code> or empty
	 */
	public static final byte[] encode(char[] str) {

		if (null == str || 0 >= str.length) {
			return null;
		}
		return encode(str, 0, str.length, null, 0);
	}

	/**
	 * Encodes <code>length</code> characters of <code>str</code>, starting
	 * at <code>start</code>, into <code>bytbuf</code> at
	 * <code>offset</code>.
	 *
	 * <p> Warning: the size of a supplied <code>bytbuf</code> is not
	 * checked.  Use {@link #encoded(char[], int, int)} to determine the
	 * number of bytes needed.
	 *
	 * @param str the characters to encode, may be <code>null</code>
	 * @param start index of the first character to encode
	 * @param length number of characters to encode
	 * @param bytbuf destination buffer; if <code>null</code> a buffer of
	 * <code>offset</code> plus the encoded length is allocated
	 * @param offset index in the buffer at which the first byte is written
	 * @return the buffer written to; <code>bytbuf</code> unchanged (possibly
	 * <code>null</code>) if <code>str</code> is <code>null</code> or
	 * <code>length</code> is not positive
	 */
	public static final byte[] encode(
		char[] str,
		int start,
		int length,
		byte[] bytbuf,
		int offset) {

		if (null == str || 0 >= length) {
			return bytbuf;
		}

		byte[] out = bytbuf;
		if (null == out) {
			// Leave room for the bytes skipped by a non-zero offset.
			out = new byte[Math.max(offset, 0) + encoded(str, start, length)];
		}

		int at = offset;
		final int end = start + length;
		for (int c = start; c < end; c++) {
			at = putChar(str[c], out, at);
		}

		return out;
	}

	/**
	 * Encodes a whole string into <code>bytbuf</code> at
	 * <code>offset</code>.
	 *
	 * @param str the string to encode, may be <code>null</code>
	 * @param bytbuf destination buffer; if <code>null</code> a buffer of
	 * <code>offset</code> plus the encoded length is allocated
	 * @param offset index in the buffer at which the first byte is written
	 * @return the buffer written to; <code>bytbuf</code> unchanged (possibly
	 * <code>null</code>) if <code>str</code> is <code>null</code> or empty
	 */
	public static final byte[] encode(String str, byte[] bytbuf, int offset) {

		if (null == str) {
			return bytbuf;
		}
		return encode(str, 0, str.length(), bytbuf, offset);
	}

	/**
	 * Encodes <code>length</code> characters of <code>str</code>, starting
	 * at <code>start</code>, into <code>bytbuf</code> at
	 * <code>offset</code>.
	 *
	 * <p> Warning: the size of a supplied <code>bytbuf</code> is not
	 * checked.  Use {@link #encoded(String)} to determine the number of
	 * bytes needed for a whole string.
	 *
	 * @param str the string to encode, may be <code>null</code>
	 * @param start index of the first character to encode
	 * @param length number of characters to encode
	 * @param bytbuf destination buffer; if <code>null</code> a buffer of
	 * <code>offset</code> plus the encoded length is allocated
	 * @param offset index in the buffer at which the first byte is written
	 * @return the buffer written to; <code>bytbuf</code> unchanged (possibly
	 * <code>null</code>) if <code>str</code> is <code>null</code> or
	 * <code>length</code> is not positive
	 */
	public static final byte[] encode(
		String str,
		int start,
		int length,
		byte[] bytbuf,
		int offset) {

		if (null == str || 0 >= length) {
			return bytbuf;
		}

		final int end = start + length;

		byte[] out = bytbuf;
		if (null == out) {
			int needed = 0;
			for (int c = start; c < end; c++) {
				needed += byteCount(str.charAt(c));
			}
			// Leave room for the bytes skipped by a non-zero offset.
			out = new byte[Math.max(offset, 0) + needed];
		}

		int at = offset;
		for (int c = start; c < end; c++) {
			at = putChar(str.charAt(c), out, at);
		}

		return out;
	}

	/**
	 * Encodes a string into a newly allocated byte array.
	 *
	 * @param s the string to encode, may be <code>null</code>
	 * @return the encoded bytes, or <code>null</code> if <code>s</code> is
	 * <code>null</code> or empty
	 */
	public static final byte[] encode(String s) {

		if (null == s) {
			return null;
		}
		return encode(s, 0, s.length(), null, 0);
	}

	/**
	 * Writes the one to three byte encoding of a single <code>char</code>
	 * into <code>buf</code> starting at <code>at</code>.
	 *
	 * @return the index following the last byte written
	 */
	private static int putChar(char ch, byte[] buf, int at) {

		int next = at;
		if (ch <= 0x7F) {
			buf[next++] = (byte) ch;
		} else if (ch <= 0x7FF) {
			buf[next++] = (byte) (0xC0 | ((ch >>> 6) & 0x1F));
			buf[next++] = (byte) (0x80 | (ch & 0x3F));
		} else {
			buf[next++] = (byte) (0xE0 | ((ch >>> 12) & 0x0F));
			buf[next++] = (byte) (0x80 | ((ch >>> 6) & 0x3F));
			buf[next++] = (byte) (0x80 | (ch & 0x3F));
		}
		return next;
	}

	/**
	 * Returns the number of bytes {@link #putChar(char, byte[], int)}
	 * writes for the given <code>char</code>.
	 */
	private static int byteCount(char ch) {

		if (ch <= 0x7F) {
			return 1;
		}
		if (ch <= 0x7FF) {
			return 2;
		}
		return 3;
	}

	/**
	 * Returns the number of bytes the <code>encode</code> methods produce
	 * for the given string.
	 *
	 * @param str the string to measure, may be <code>null</code>
	 * @return the encoded length in bytes, 0 if <code>str</code> is
	 * <code>null</code>
	 */
	public static final int encoded(String str) {

		if (null == str) {
			return 0;
		}

		int bytlen = 0;
		final int count = str.length();
		for (int c = 0; c < count; c++) {
			bytlen += byteCount(str.charAt(c));
		}
		return bytlen;
	}

	/**
	 * Returns the number of bytes the <code>encode</code> methods produce
	 * for <code>len</code> characters of <code>str</code> starting at
	 * <code>start</code>.
	 *
	 * @param str the characters to measure, may be <code>null</code>
	 * @param start index of the first character to measure
	 * @param len number of characters to measure
	 * @return the encoded length in bytes, 0 if <code>str</code> is
	 * <code>null</code> or <code>len</code> is not positive
	 */
	public static final int encoded(char[] str, int start, int len) {

		if (null == str || 0 >= len) {
			return 0;
		}

		int bytlen = 0;
		final int end = start + len;
		for (int c = start; c < end; c++) {
			bytlen += byteCount(str[c]);
		}
		return bytlen;
	}

	/**
	 * Static method to generate the UTF-8 representation of a Unicode character.
	 * This particular code is taken from saxon (see http://saxon.sf.net).
	 *
	 * <p> Unlike the <code>encode</code> methods, a surrogate pair is
	 * combined into a single four-byte sequence.
	 *
	 * @param in the Unicode character, or the high half of a surrogate pair
	 * @param in2 the low half of a surrogate pair (ignored unless the first argument is in the
	 * range for a surrogate pair)
	 * @param out an array of at least 4 bytes to hold the UTF-8 representation.
	 * @return the number of bytes in the UTF-8 representation; 0 if
	 * <code>in</code> is itself a low surrogate, in which case nothing is
	 * written
	 * @throws IllegalArgumentException if <code>in</code> is a high
	 * surrogate and <code>in2</code> is not a low surrogate
	 */
	public static int getUTF8Encoding(char in, char in2, byte[] out) {
		// See Tony Graham, "Unicode, a Primer", page 92
		int i = (int) in;
		if (i <= 0x7f) {
			out[0] = (byte) i;
			return 1;
		} else if (i <= 0x7ff) {
			out[0] = (byte) (0xc0 | ((in >> 6) & 0x1f));
			out[1] = (byte) (0x80 | (in & 0x3f));
			return 2;
		} else if (i >= 0xd800 && i <= 0xdbff) {
			// surrogate pair
			int j = (int) in2;
			if (!(j >= 0xdc00 && j <= 0xdfff)) {
				throw new IllegalArgumentException("Malformed Unicode Surrogate Pair (" + i + "," + j + ")");
			}
			// Bit layout: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx, where
			// uuuuu is the plane number (high surrogate bits 6-9 plus one).
			byte xxxxxx = (byte) (j & 0x3f);
			byte yyyyyy = (byte) (((i & 0x03) << 4) | ((j >> 6) & 0x0f));
			byte zzzz = (byte) ((i >> 2) & 0x0f);
			byte uuuuu = (byte) (((i >> 6) & 0x0f) + 1);
			out[0] = (byte) (0xf0 | ((uuuuu >> 2) & 0x07));
			out[1] = (byte) (0x80 | ((uuuuu & 0x03) << 4) | zzzz);
			out[2] = (byte) (0x80 | yyyyyy);
			out[3] = (byte) (0x80 | xxxxxx);
			return 4;
		} else if (i >= 0xdc00 && i <= 0xdfff) {
			// second half of surrogate pair - ignore it
			return 0;
		} else {
			out[0] = (byte) (0xe0 | ((in >> 12) & 0x0f));
			out[1] = (byte) (0x80 | ((in >> 6) & 0x3f));
			out[2] = (byte) (0x80 | (in & 0x3f));
			return 3;
		}
	}

}