/* * `gnu.iou' I/O buffers and utilities. * Copyright (C) 1998, 1999, 2000, 2001, 2002 John Pritchard. * * This program is free software; you can redistribute it or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA * 02111-1307 USA */ package org.exist.util; /** * This class contains two static tools for doing UTF-8 encoding and * decoding. * * <p> UTF-8 is ASCII- transparent. It supports character sets * requiring more than the seven bit ASCII base range of UTF-8, * including Unicode, ISO-8859, ISO-10646, etc.. * * <p> We do not use an ISO UCS code signature, and we do not use a * Java Data I/O- style strlen prefix. * * @author John Pritchard (john@syntelos.org) */ public class UTF8 { /** * Decode UTF-8 input, terminates decoding at a null character, * value 0x0. * * @exception IllegalStateException Bad format. */ public final static XMLString decode(byte[] code) { if (null == code) return null; return decode(code, 0, code.length); } public final static XMLString decode(byte[] code, int off, int many) { if (null == code || 0 >= code.length) return null; XMLString xs = new XMLString(many); return decode(code, off, many, xs); } /** * Decode UTF-8 input, terminates decoding at a null character, * value 0x0. * * @exception IllegalStateException Bad format. */ public final static XMLString decode(byte[] code, int off, int many, XMLString xs) { if (null == code || 0 >= code.length) return null; char ch; int end = (off + many); byte cc; for (int c = off; c < end; c++) { cc = code[c]; if (0 <= cc) { xs.append((char) cc); } else if (0 == cc) { return xs; } else { ch = 0; if (b11000000 == (cc & b11100000)) { ch |= (code[c + 1] & b00111111); ch |= (cc & b00011111) << 6; c += 1; } else if (b11100000 == (cc & b11110000)) { ch |= (code[c + 2] & b00111111); ch |= (code[c + 1] & b00111111) << 6; ch |= (cc & b00001111) << 12; c += 2; } else if (b11110000 == (cc & b11111000)) { ch |= (code[c + 3] & b00111111); ch |= (code[c + 2] & b00111111) << 6; ch |= (code[c + 1] & b00111111) << 12; c += 3; } else if (b11111000 == (cc & b11111100)) { ch |= (code[c + 4] & b00111111); ch |= (code[c + 3] & b00111111) << 6; ch |= (code[c + 2] & b00111111) << 12; c += 4; } else if (b11111100 == (cc & b11111110)) { ch |= (code[c + 5] & b00111111); ch |= (code[c + 4] & b00111111) << 6; ch |= (code[c + 3] & b00111111) << 12; c += 5; } else { ch = (char) (cc & b01111111); // 0x7f } xs.append(ch); } // else // if ( 0 < cc) } return xs; } /** * Encode string in UTF-8. */ public final static byte[] encode(char[] str) { if (null == str || 0 >= str.length) return null; return encode(str, 0, str.length, null, 0); } /** * Encode string in UTF-8. * * Warning: the size of bytbuf is not checked. Use encoded() to determine * the size needed. */ public final static byte[] encode( char[] str, int start, int length, byte[] bytbuf, int offset) { if (null == str || 0 >= length) return bytbuf; if (bytbuf == null) bytbuf = new byte[encoded(str, start, length)]; char ch, sch; int end = start + length; for (int c = start; c < end; c++) { ch = str[c]; if (0x7f >= ch) { bytbuf[offset++] = (byte) ch; } else if (0x7ff >= ch) { sch = (char) (ch >>> 6); if (0 < sch) { bytbuf[offset++] = (byte) (b11000000 | (sch & b00011111)); } else bytbuf[offset++] = (byte) (b11000000); bytbuf[offset++] = (byte) (b10000000 | (ch & b00111111)); } else { sch = (char) (ch >>> 12); if (0 < sch) { bytbuf[offset++] = (byte) (b11100000 | (sch & b00001111)); } else bytbuf[offset++] = (byte) (b11100000); bytbuf[offset++] = (byte) (b10000000 | ((ch >>> 6) & b00111111)); bytbuf[offset++] = (byte) (b10000000 | (ch & b00111111)); } } return bytbuf; } public final static byte[] encode(String str, byte[] bytbuf, int offset) { return encode(str, 0, str.length(), bytbuf, offset); } /** * Encode string in UTF-8. * * Warning: the size of bytbuf is not checked. Use encoded() to determine * the size needed. */ public final static byte[] encode( String str, int start, int length, byte[] bytbuf, int offset) { if (null == str || 0 >= length) return bytbuf; char ch, sch; int end = start + length; for (int c = start; c < end; c++) { ch = str.charAt(c); if (0x7f >= ch) { bytbuf[offset++] = (byte) ch; } else if (0x7ff >= ch) { sch = (char) (ch >>> 6); if (0 < sch) { bytbuf[offset++] = (byte) (b11000000 | (sch & b00011111)); } else bytbuf[offset++] = (byte) (b11000000); bytbuf[offset++] = (byte) (b10000000 | (ch & b00111111)); } else { sch = (char) (ch >>> 12); if (0 < sch) { bytbuf[offset++] = (byte) (b11100000 | (sch & b00001111)); } else bytbuf[offset++] = (byte) (b11100000); bytbuf[offset++] = (byte) (b10000000 | ((ch >>> 6) & b00111111)); bytbuf[offset++] = (byte) (b10000000 | (ch & b00111111)); } } return bytbuf; } /** * Encode string in UTF-8. */ public final static byte[] encode(String s) { if (null == s) return null; else { return encode(s.toCharArray(), 0, s.length(), null, 0); } } private final static char b10000000 = (char) 0x80; private final static char b11000000 = (char) 0xC0; private final static char b11100000 = (char) 0xE0; private final static char b11110000 = (char) 0xF0; private final static char b11111000 = (char) 0xF8; private final static char b11111100 = (char) 0xFC; private final static char b11111110 = (char) 0xFE; private final static char b01111111 = (char) 0x7F; private final static char b00111111 = (char) 0x3F; private final static char b00011111 = (char) 0x1F; private final static char b00001111 = (char) 0x0F; //private final static char b00000111 = (char) 0x07; //private final static char b00000011 = (char) 0x03; //private final static char b00000001 = (char) 0x01; /** * Returns the length of the string encoded in UTF-8. */ public final static int encoded(String str) { if (null == str) return 0; int bytlen = 0; char ch; //char sch; for (int c = 0; c < str.length(); c++) { ch = str.charAt(c); if (0x7f >= ch) bytlen++; else if (0x7ff >= ch) bytlen += 2; else bytlen += 3; } return bytlen; } /** * Returns the length of the string encoded in UTF-8. */ public final static int encoded(char[] str, int start, int len) { if (null == str || 0 >= len) return 0; int bytlen = 0; char ch; //char sch; int end = start + len; for (int c = start; c < end; c++) { ch = str[c]; if (0x7f >= ch) bytlen++; else if (0x7ff >= ch) bytlen += 2; else bytlen += 3; } return bytlen; } /** * Static method to generate the UTF-8 representation of a Unicode character. * This particular code is taken from saxon (see http://saxon.sf.net). * * @param in the Unicode character, or the high half of a surrogate pair * @param in2 the low half of a surrogate pair (ignored unless the first argument is in the * range for a surrogate pair) * @param out an array of at least 4 bytes to hold the UTF-8 representation. * @return the number of bytes in the UTF-8 representation */ public static int getUTF8Encoding(char in, char in2, byte[] out) { // See Tony Graham, "Unicode, a Primer", page 92 int i = (int)in; if (i<=0x7f) { out[0] = (byte)i; return 1; } else if (i<=0x7ff) { out[0] = (byte)(0xc0 | ((in >> 6) & 0x1f)); out[1] = (byte)(0x80 | (in & 0x3f)); return 2; } else if (i>=0xd800 && i<=0xdbff) { // surrogate pair int j = (int)in2; if (!(j>=0xdc00 && j<=0xdfff)) { throw new IllegalArgumentException("Malformed Unicode Surrogate Pair (" + i + "," + j + ")"); } byte xxxxxx = (byte)(j & 0x3f); byte yyyyyy = (byte)(((i & 0x03) << 4) | ((j >> 6) & 0x0f)); byte zzzz = (byte)((i >> 2) & 0x0f); byte uuuuu = (byte)(((i >> 6) & 0x0f) + 1); out[0] = (byte)(0xf0 | ((uuuuu >> 2) & 0x07)); out[1] = (byte)(0x80 | ((uuuuu & 0x03) << 4) | zzzz); out[2] = (byte)(0x80 | yyyyyy); out[3] = (byte)(0x80 | xxxxxx); return 4; } else if (i>=0xdc00 && i<=0xdfff) { // second half of surrogate pair - ignore it return 0; } else { out[0] = (byte)(0xe0 | ((in >> 12) & 0x0f)); out[1] = (byte)(0x80 | ((in >> 6) & 0x3f)); out[2] = (byte)(0x80 | (in & 0x3f)); return 3; } } }