/*
* Copyright (c) 1998-2011 Caucho Technology -- all rights reserved
*
* This file is part of Resin(R) Open Source
*
* Each copy or derived work must preserve the copyright notice and this
* notice unmodified.
*
* Resin Open Source is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Resin Open Source is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
* of NON-INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with Resin Open Source; if not, write to the
*
* Free Software Foundation, Inc.
* 59 Temple Place, Suite 330
* Boston, MA 02111-1307 USA
*
* @author Nam Nguyen
*/
package com.caucho.quercus.lib.i18n;
import java.util.logging.Logger;
import com.caucho.quercus.env.Env;
import com.caucho.quercus.env.StringValue;
import com.caucho.util.L10N;
/**
 * Decoder that turns a UTF-8 encoded byte string (one byte per char of the
 * {@link StringValue}) into UTF-16 chars.
 *
 * <p>Malformed input is handled according to the inherited
 * {@code _isIgnoreErrors}, {@code _replacement} and
 * {@code _isAllowMalformedOut} settings; see
 * {@link #decodeImpl(Env, StringValue)}.
 */
public class Utf8Decoder
  extends Decoder
{
  private static final Logger log
    = Logger.getLogger(Utf8Decoder.class.getName());
  private static final L10N L = new L10N(Utf8Decoder.class);

  /** Value returned by {@link Utf8Reader#read()} for a malformed sequence. */
  private static final int ERROR_CHARACTER = 0xFFFE;

  /** Value returned by {@link Utf8Reader#read()} at the end of the input. */
  private static final int EOF = -1;

  /**
   * Creates a decoder for the given charset name.
   *
   * @param charset charset name, passed through to the superclass
   */
  public Utf8Decoder(String charset)
  {
    super(charset);
  }

  /**
   * Identifies this decoder as a UTF-8 decoder.
   *
   * @return always {@code true}
   */
  public boolean isUtf8()
  {
    return true;
  }

  /**
   * Checks whether the string can be read as UTF-8 without hitting a
   * malformed sequence.
   *
   * @param env unused by this implementation
   * @param str the string to scan
   * @return {@code true} if {@code str} is already unicode or the scan
   *         completes without the reader reporting an error
   */
  @Override
  public boolean isDecodable(Env env, StringValue str)
  {
    if (str.isUnicode())
      return true;

    Utf8Reader reader = new Utf8Reader(str);

    int ch;
    while ((ch = reader.read()) >= 0) {
      if (ch == ERROR_CHARACTER)
        return false;
    }

    return true;
  }

  /**
   * Decodes the UTF-8 bytes of {@code str} into UTF-16 chars.
   *
   * <p>When a lead byte is not followed by the required continuation bytes
   * (or is not a valid lead byte at all, or encodes a value above U+10FFFF),
   * only that lead byte is consumed and one of the following happens, in
   * this order of precedence: it is dropped ({@code _isIgnoreErrors}), the
   * replacement is appended ({@code _replacement != null}), the raw byte is
   * appended ({@code _isAllowMalformedOut}), or decoding stops.
   *
   * @param env unused by this implementation
   * @param str the byte string to decode
   * @return the decoded chars; if decoding stopped at malformed input, the
   *         chars decoded up to that point
   */
  @Override
  protected StringBuilder decodeImpl(Env env, StringValue str)
  {
    StringBuilder sb = new StringBuilder();

    int len = str.length();

    for (int i = 0; i < len; i++) {
      int ch = str.charAt(i);

      if (ch <= 0x7F) {
        sb.append((char) ch);
        continue;
      }

      boolean isDecoded = false;

      if (0xC2 <= ch && ch <= 0xDF) {
        // two-byte sequence (0xC0 and 0xC1 are rejected as overlong)
        if (i + 1 < len && isContinuation(str.charAt(i + 1))) {
          int ch2 = str.charAt(i + 1);

          int code = ((ch - 0xC0) << 6) + (ch2 - 0x80);
          sb.append((char) code);

          i++;
          isDecoded = true;
        }
      }
      else if (0xE0 <= ch && ch <= 0xEF) {
        // three-byte sequence
        if (i + 2 < len
            && isContinuation(str.charAt(i + 1))
            && isContinuation(str.charAt(i + 2))) {
          int ch2 = str.charAt(i + 1);
          int ch3 = str.charAt(i + 2);

          int code = ((ch - 0xE0) << 12)
                     + ((ch2 - 0x80) << 6)
                     + (ch3 - 0x80);

          if (0xD800 <= code && code <= 0xDBFF) {
            // NOTE(review): an encoded high surrogate is expanded into two
            // chars here. This is kept exactly as it was, but it looks
            // suspicious -- confirm whether the char should simply be
            // appended as-is.
            int high = 0xD800 + (code >> 10);
            int low = 0xDC00 + (code & 0x3FF);

            sb.append((char) high);
            sb.append((char) low);
          }
          else
            sb.append((char) code);

          i += 2;
          isDecoded = true;
        }
      }
      else if (0xF0 <= ch && ch <= 0xF4) {
        // four-byte sequence
        if (i + 3 < len
            && isContinuation(str.charAt(i + 1))
            && isContinuation(str.charAt(i + 2))
            && isContinuation(str.charAt(i + 3))) {
          int ch2 = str.charAt(i + 1);
          int ch3 = str.charAt(i + 2);
          int ch4 = str.charAt(i + 3);

          int code = ((ch - 0xF0) << 18)
                     + ((ch2 - 0x80) << 12)
                     + ((ch3 - 0x80) << 6)
                     + (ch4 - 0x80);

          // 0xF4 followed by 0x90..0xBF encodes a value past the last
          // code point; that is left undecoded and treated as malformed
          if (code <= Character.MAX_CODE_POINT) {
            if (code >= Character.MIN_SUPPLEMENTARY_CODE_POINT)
              decodeCodePoint(sb, code);
            else
              sb.append((char) code);

            i += 3;
            isDecoded = true;
          }
        }
      }

      if (! isDecoded && ! appendMalformed(sb, ch))
        return sb;
    }

    return sb;
  }

  /**
   * Returns true if the value is a UTF-8 continuation byte (0x80..0xBF).
   */
  private static boolean isContinuation(int ch)
  {
    return 0x80 <= ch && ch <= 0xBF;
  }

  /**
   * Applies the configured error policy for a malformed lead byte.
   *
   * @param sb the output being built
   * @param ch the offending byte
   * @return {@code true} if decoding should continue with the next byte,
   *         {@code false} if decoding must stop
   */
  private boolean appendMalformed(StringBuilder sb, int ch)
  {
    if (_isIgnoreErrors) {
      // silently drop the byte
      return true;
    }
    else if (_replacement != null) {
      sb.append(_replacement);
      return true;
    }
    else if (_isAllowMalformedOut) {
      sb.append((char) ch);
      return true;
    }
    else
      return false;
  }

  /**
   * Appends a supplementary code point (U+10000..U+10FFFF) as a UTF-16
   * surrogate pair.
   *
   * @param sb the output being built
   * @param code the code point; the caller guarantees it is supplementary
   */
  private static void decodeCodePoint(StringBuilder sb, int code)
  {
    // surrogates encode the 20-bit offset from U+10000, ten bits each
    int offset = code - Character.MIN_SUPPLEMENTARY_CODE_POINT;

    int high = 0xD800 + (offset >> 10);
    int low = 0xDC00 + (offset & 0x3FF);

    sb.append((char) high);
    sb.append((char) low);
  }

  /**
   * Pull reader that returns one UTF-16 char per call from a UTF-8 byte
   * string, {@code ERROR_CHARACTER} for a malformed sequence, and
   * {@code EOF} when the input is exhausted.
   */
  static class Utf8Reader
  {
    // pending low surrogate of a supplementary character, or -1
    int _peek = -1;
    int _index;
    final int _len;
    StringValue _str;

    /**
     * Creates a reader positioned at the start of the string.
     *
     * @param str the byte string to read
     */
    public Utf8Reader(StringValue str)
    {
      _str = str;
      _len = str.length();
    }

    /**
     * Reads the next UTF-16 char.
     *
     * <p>A four-byte sequence is returned as two consecutive calls: the high
     * surrogate first, then the low surrogate. A decoded U+FEFF (byte order
     * mark) is skipped.
     *
     * @return the char, {@code ERROR_CHARACTER} on malformed input, or
     *         {@code EOF} at the end of the input
     */
    public int read()
    {
      if (_peek >= 0) {
        // the pending low surrogate is already decoded; it must not be
        // classified as a lead byte again
        int low = _peek;
        _peek = -1;

        return low;
      }

      // loops only to skip byte order marks
      while (true) {
        int ch1 = readByte();

        if (ch1 < 0x80) {
          // ASCII, or EOF (-1)
          return ch1;
        }

        if ((ch1 & 0xe0) == 0xc0) {
          int ch2 = readByte();

          if (ch2 < 0)
            return ERROR_CHARACTER;
          else if ((ch2 & 0xc0) != 0x80) {
            unread();
            return ERROR_CHARACTER;
          }

          return ((ch1 & 0x1f) << 6) + (ch2 & 0x3f);
        }
        else if ((ch1 & 0xf0) == 0xe0) {
          int ch2 = readByte();

          if (ch2 < 0)
            return ERROR_CHARACTER;
          else if ((ch2 & 0xc0) != 0x80) {
            unread();
            return ERROR_CHARACTER;
          }

          int ch3 = readByte();

          if (ch3 < 0) {
            unread();
            return ERROR_CHARACTER;
          }
          else if ((ch3 & 0xc0) != 0x80) {
            unread();
            unread();
            return ERROR_CHARACTER;
          }

          int ch = ((ch1 & 0x0f) << 12) + ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);

          if (ch != 0xfeff)
            return ch;

          // byte order mark, written by some writers (e.g. microsoft):
          // skip it and decode the character that follows
        }
        else if ((ch1 & 0xf0) == 0xf0) {
          int ch2 = readByte();

          if (ch2 < 0)
            return ERROR_CHARACTER;
          else if ((ch2 & 0xc0) != 0x80) {
            unread();
            return ERROR_CHARACTER;
          }

          int ch3 = readByte();

          if (ch3 < 0) {
            unread();
            return ERROR_CHARACTER;
          }
          else if ((ch3 & 0xc0) != 0x80) {
            unread();
            unread();
            return ERROR_CHARACTER;
          }

          int ch4 = readByte();

          if (ch4 < 0) {
            unread();
            unread();
            return ERROR_CHARACTER;
          }
          else if ((ch4 & 0xc0) != 0x80) {
            unread();
            unread();
            unread();
            return ERROR_CHARACTER;
          }

          int ch = (((ch1 & 0xf) << 18)
                    + ((ch2 & 0x3f) << 12)
                    + ((ch3 & 0x3f) << 6)
                    + ((ch4 & 0x3f)));

          // the surrogate arithmetic below is only meaningful for
          // U+10000..U+10FFFF; overlong or out-of-range values are errors
          if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT
              || ch > Character.MAX_CODE_POINT)
            return ERROR_CHARACTER;

          int offset = ch - Character.MIN_SUPPLEMENTARY_CODE_POINT;

          _peek = 0xdc00 + (offset & 0x3ff);

          return 0xd800 + (offset >> 10);
        }
        else
          return ERROR_CHARACTER;
      }
    }

    /**
     * Returns the next raw byte and advances, or {@code EOF} at the end.
     */
    private int readByte()
    {
      if (_index < _len)
        return _str.charAt(_index++);
      else
        return EOF;
    }

    /**
     * Steps the read position back by one byte.
     */
    private void unread()
    {
      _index--;
    }
  }
}