/* * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * * Free Software Foundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Nam Nguyen */ package com.caucho.quercus.lib.i18n; import com.caucho.quercus.env.*; public class UnicodeUtility { public static StringValue utf8Clean(Env env, StringValue str, String replacement, boolean isIgnore) { StringValue sb = str.createStringBuilder(); int len = str.length(); for (int i = 0; i < len; i++) { char ch = str.charAt(i); if (ch <= 0x7F) sb.append(ch); else if (0xC2 <= ch && ch <= 0xDF) { char ch2; if (i + 1 < len && 0x80 <= (ch2 = str.charAt(i + 1)) && ch2 <= 0xBF) { i++; sb.append(ch); sb.append(ch2); } else if (isIgnore) { } else if (replacement != null) sb.append(replacement); else return sb; } else if (0xE0 <= ch && ch <= 0xEF) { char ch2; char ch3; if (i + 2 < len && 0x80 <= (ch2 = str.charAt(i + 1)) && ch2 <= 0xBF && 0x80 <= (ch3 = str.charAt(i + 2)) && ch3 <= 0xBF) { i += 2; sb.append(ch); sb.append(ch2); sb.append(ch3); } else if (isIgnore) { } else if (replacement != null) sb.append(replacement); else return sb; } else if (0xF0 <= ch && ch <= 0xF4) { char ch2; char ch3; char ch4; if (i + 3 < len && 0x80 <= (ch2 = str.charAt(i + 1)) && ch2 <= 0xBF && 0x80 <= (ch3 = str.charAt(i + 2)) && ch3 <= 0xBF && 0x80 <= (ch4 = str.charAt(i + 3)) && ch4 <= 0xBF) { i += 3; sb.append(ch); sb.append(ch2); sb.append(ch3); sb.append(ch4); } else if (isIgnore) { } else if (replacement != null) sb.append(replacement); else return sb; } else if (isIgnore) { } else if (replacement != null) sb.append(replacement); else return sb; } return sb; } public static CharSequence decode(Env env, StringValue str, String charset) { return decode(env, str, charset, null, false); } public static CharSequence decode(Env env, StringValue str, String charset, String replacement, boolean isIgnoreErrors) { Decoder decoder = Decoder.create(charset); decoder.setReplacement(replacement); decoder.setIgnoreErrors(isIgnoreErrors); return decoder.decode(env, str); } public static StringValue encode(Env env, CharSequence str, String charset) { return encode(env, str, charset, null, false); } public static StringValue encode(Env env, CharSequence str, String charset, String replacement, boolean isIgnoreErrors) { Encoder encoder = Encoder.create(charset); encoder.setReplacement(replacement); encoder.setIgnoreErrors(isIgnoreErrors); return encoder.encode(env, str); } public static StringValue decodeEncode(Env env, StringValue str, String inCharset, String outCharset, String replacement, boolean isIgnoreErrors) { boolean isStartUtf8 = false; boolean isEndUtf8 = false; if (inCharset.equalsIgnoreCase("utf8") || inCharset.equalsIgnoreCase("utf-8")) isStartUtf8 = true; if (outCharset.equalsIgnoreCase("utf8") || outCharset.equalsIgnoreCase("utf-8")) isEndUtf8 = true; if (isStartUtf8 && isEndUtf8) return UnicodeUtility.utf8Clean(env, str, null, isIgnoreErrors); // decode phase CharSequence unicodeStr; Decoder decoder; if (isStartUtf8) decoder = new Utf8Decoder(inCharset); else decoder = new GenericDecoder(inCharset); decoder.setIgnoreErrors(isIgnoreErrors); unicodeStr = decoder.decode(env, str); // encode phase Encoder encoder; if (isEndUtf8) encoder = new Utf8Encoder(); else encoder = Encoder.create(outCharset); encoder.setIgnoreErrors(isIgnoreErrors); return encoder.encode(env, unicodeStr); } }