/*****************************************************************************
 * Copyright (C) 2008 EnterpriseDB Corporation.
 * Copyright (C) 2011 Stado Global Development Group.
 *
 * This file is part of Stado.
 *
 * Stado is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Stado is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Stado. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can find Stado at http://www.stado.us
 *
 ****************************************************************************/
package org.postgresql.driver.core;

import java.io.IOException;

import org.postgresql.driver.util.GT;

/**
 * {@link Encoding} specialisation with a hand-written, strictly validating
 * UTF-8 byte[] -&gt; String decoder.
 *
 * <p>{@link #decode(byte[], int, int)} is synchronized because it reuses a
 * single per-instance scratch buffer across calls.
 */
class UTF8Encoding extends Encoding {
    UTF8Encoding(String jvmEncoding) {
        super(jvmEncoding);
    }

    // Smallest code point that legitimately needs a 2-, 3- or 4-byte encoding.
    private static final int MIN_2_BYTES = 0x80;
    private static final int MIN_3_BYTES = 0x800;
    private static final int MIN_4_BYTES = 0x10000;
    private static final int MAX_CODE_POINT = 0x10ffff;

    // Message used whenever a multibyte sequence does not fit in the requested range.
    private static final String TRUNCATED_MESSAGE =
        "Illegal UTF-8 sequence: multibyte sequence was truncated";

    // Scratch buffer reused by decode(); replaced by a larger one when needed.
    private char[] decoderArray = new char[1024];

    /**
     * Verifies that a byte has the continuation-byte form 10xxxxxx.
     *
     * @param ch  the byte to check; only the low 8 bits are considered, so a
     *            sign-extended <code>byte</code> may be passed directly
     * @param pos 1-based position of the byte within its sequence (for the message)
     * @param len total length of the sequence in bytes (for the message)
     * @throws IOException if the byte is not of the form 10xxxxxx
     */
    private static void checkByte(int ch, int pos, int len) throws IOException {
        // Normalise to 0..255 so the reported value matches the unsigned
        // values reported for illegal initial bytes.
        int unsigned = ch & 0xff;
        if ((unsigned & 0xc0) != 0x80)
            throw new IOException(GT.tr("Illegal UTF-8 sequence: byte {0} of {1} byte sequence is not 10xxxxxx: {2}",
                                        new Object[] { new Integer(pos), new Integer(len), new Integer(unsigned) }));
    }

    /**
     * Rejects overlong encodings, i.e. values encoded with more bytes than needed.
     *
     * @param ch       the decoded value
     * @param minValue the smallest value that requires the sequence length that
     *                 was actually used; one of MIN_2_BYTES, MIN_3_BYTES, MIN_4_BYTES
     * @throws IOException if <code>ch</code> is smaller than <code>minValue</code>
     * @throws IllegalArgumentException if <code>minValue</code> is not one of the
     *                 three known thresholds
     */
    private static void checkMinimal(int ch, int minValue) throws IOException {
        if (ch >= minValue)
            return;

        int actualLen;
        switch (minValue) {
        case MIN_2_BYTES:
            actualLen = 2;
            break;
        case MIN_3_BYTES:
            actualLen = 3;
            break;
        case MIN_4_BYTES:
            actualLen = 4;
            break;
        default:
            throw new IllegalArgumentException("unexpected minValue passed to checkMinimal: " + minValue);
        }

        int expectedLen;
        if (ch < MIN_2_BYTES)
            expectedLen = 1;
        else if (ch < MIN_3_BYTES)
            expectedLen = 2;
        else if (ch < MIN_4_BYTES)
            expectedLen = 3;
        else
            throw new IllegalArgumentException("unexpected ch passed to checkMinimal: " + ch);

        throw new IOException(GT.tr("Illegal UTF-8 sequence: {0} bytes used to encode a {1} byte value: {2}",
                                    new Object[] { new Integer(actualLen), new Integer(expectedLen), new Integer(ch) }));
    }

    /**
     * Reads one continuation byte and returns its six payload bits.
     *
     * @param data  the array being decoded
     * @param index index of the continuation byte in <code>data</code>
     * @param end   exclusive upper bound of the range being decoded
     * @param pos   1-based position of the byte within its sequence (for messages)
     * @param len   total length of the sequence in bytes (for messages)
     * @return the low six bits of the byte
     * @throws IOException if <code>index</code> lies at or beyond <code>end</code>,
     *                     or the byte is not of the form 10xxxxxx
     */
    private static int continuationBits(byte[] data, int index, int end, int pos, int len) throws IOException {
        // Never consult bytes outside the range the caller asked us to decode.
        if (index >= end)
            throw new IOException(TRUNCATED_MESSAGE);
        int b = data[index];
        checkByte(b, pos, len);
        return b & 0x3f;
    }

    /**
     * Custom byte[] -> String conversion routine for UTF-8 only.
     * This is about twice as fast as using the String(byte[],int,int,String)
     * ctor, at least under JDK 1.4.2. The extra checks for illegal representations
     * add about 10-15% overhead, but they seem worth it given the number of SQL_ASCII
     * databases out there.
     *
     * <p>Rejected input: stray continuation bytes, initial bytes of the form
     * 11111xxx, malformed continuation bytes, overlong encodings, values above
     * U+10FFFF, directly encoded surrogates, and sequences that extend past
     * <code>offset + length</code>.
     *
     * @param data the array containing UTF8-encoded data
     * @param offset the offset of the first byte in <code>data</code> to decode from
     * @param length the number of bytes to decode
     * @return a decoded string
     * @throws IOException if something goes wrong
     */
    public synchronized String decode(byte[] data, int offset, int length) throws IOException {
        // Every sequence yields at most one char per input byte, so a buffer
        // of `length` chars always suffices.
        char[] cdata = decoderArray;
        if (cdata.length < length)
            cdata = decoderArray = new char[length];

        int in = offset;
        int out = 0;
        int end = length + offset;

        try {
            while (in < end) {
                int ch = data[in++] & 0xff;

                // Convert UTF-8 to 21-bit codepoint.
                if (ch < 0x80) {
                    // 0xxxxxxx -- length 1.
                } else if (ch < 0xc0) {
                    // 10xxxxxx -- illegal!
                    throw new IOException(GT.tr("Illegal UTF-8 sequence: initial byte is {0}: {1}",
                                                new Object[] { "10xxxxxx", new Integer(ch) }));
                } else if (ch < 0xe0) {
                    // 110xxxxx 10xxxxxx
                    ch = ((ch & 0x1f) << 6);
                    ch = ch | continuationBits(data, in++, end, 2, 2);
                    checkMinimal(ch, MIN_2_BYTES);
                } else if (ch < 0xf0) {
                    // 1110xxxx 10xxxxxx 10xxxxxx
                    ch = ((ch & 0x0f) << 12);
                    ch = ch | (continuationBits(data, in++, end, 2, 3) << 6);
                    ch = ch | continuationBits(data, in++, end, 3, 3);
                    checkMinimal(ch, MIN_3_BYTES);
                } else if (ch < 0xf8) {
                    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    ch = ((ch & 0x07) << 18);
                    ch = ch | (continuationBits(data, in++, end, 2, 4) << 12);
                    ch = ch | (continuationBits(data, in++, end, 3, 4) << 6);
                    ch = ch | continuationBits(data, in++, end, 4, 4);
                    checkMinimal(ch, MIN_4_BYTES);
                } else {
                    throw new IOException(GT.tr("Illegal UTF-8 sequence: initial byte is {0}: {1}",
                                                new Object[] { "11111xxx", new Integer(ch) }));
                }

                if (ch > MAX_CODE_POINT)
                    throw new IOException(GT.tr("Illegal UTF-8 sequence: final value is out of range: {0}",
                                                new Integer(ch)));

                // Convert 21-bit codepoint to Java chars:
                //   0..ffff are represented directly as a single char
                //   10000..10ffff are represented as a "surrogate pair" of two chars
                // See: http://java.sun.com/developer/technicalArticles/Intl/Supplementary/
                if (ch > 0xffff) {
                    // Use a surrogate pair to represent it.
                    ch -= 0x10000;  // ch is now 0..fffff (20 bits)
                    cdata[out++] = (char) (0xd800 + (ch >> 10));   // top 10 bits
                    cdata[out++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits
                } else if (ch >= 0xd800 && ch < 0xe000) {
                    // Not allowed to encode the surrogate range directly.
                    throw new IOException(GT.tr("Illegal UTF-8 sequence: final value is a surrogate value: {0}",
                                                new Integer(ch)));
                } else {
                    // Normal case.
                    cdata[out++] = (char) ch;
                }
            }
        } catch (ArrayIndexOutOfBoundsException a) {
            // Continuation bytes are bounds-checked against `end`, so this is
            // only reachable when offset/length describe a range that is not
            // inside `data`. Keep reporting it as an IOException, but retain
            // the cause for diagnosis.
            IOException truncated = new IOException(TRUNCATED_MESSAGE);
            truncated.initCause(a);
            throw truncated;
        }

        // The loop never advances past `end`; this only fires when `length`
        // is negative, in which case `in` starts beyond `end`.
        if (in > end)
            throw new IOException(TRUNCATED_MESSAGE);

        return new String(cdata, 0, out);
    }
}