UnicodeUtility.java example

Explorer
resin-master
/*
 * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved
 *
 * This file is part of Resin(R) Open Source
 *
 * Each copy or derived work must preserve the copyright notice and this
 * notice unmodified.
 *
 * Resin Open Source is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Resin Open Source is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
 * of NON-INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Resin Open Source; if not, write to the
 *
 *   Free Software Foundation, Inc.
 *   59 Temple Place, Suite 330
 *   Boston, MA 02111-1307  USA
 *
 * @author Nam Nguyen
 */

package com.caucho.quercus.lib.i18n;

import com.caucho.quercus.env.*;

public class UnicodeUtility
{
  public static StringValue utf8Clean(Env env,
                                      StringValue str,
                                      String replacement,
                                      boolean isIgnore)
  {
    StringValue sb = str.createStringBuilder();
    
    int len = str.length();
    
    for (int i = 0; i < len; i++) {
      char ch = str.charAt(i);
      
      if (ch <= 0x7F)
        sb.append(ch);
      else if (0xC2 <= ch && ch <= 0xDF) {
        char ch2;
        if (i + 1 < len
            && 0x80 <= (ch2 = str.charAt(i + 1)) && ch2 <= 0xBF) {
          i++;
          sb.append(ch);
          sb.append(ch2);
        }
        else if (isIgnore) {
        }
        else if (replacement != null)
          sb.append(replacement);
        else
          return sb;
      }
      else if (0xE0 <= ch && ch <= 0xEF) {
        char ch2;
        char ch3;
        if (i + 2 < len
            && 0x80 <= (ch2 = str.charAt(i + 1)) && ch2 <= 0xBF
            && 0x80 <= (ch3 = str.charAt(i + 2)) && ch3 <= 0xBF) {
          i += 2;
          sb.append(ch);
          sb.append(ch2);
          sb.append(ch3);
        }
        else if (isIgnore) {
        }
        else if (replacement != null)
          sb.append(replacement);
        else
          return sb;
      }
      else if (0xF0 <= ch && ch <= 0xF4) {
        char ch2;
        char ch3;
        char ch4;
        
        if (i + 3 < len
            && 0x80 <= (ch2 = str.charAt(i + 1)) && ch2 <= 0xBF
            && 0x80 <= (ch3 = str.charAt(i + 2)) && ch3 <= 0xBF
            && 0x80 <= (ch4 = str.charAt(i + 3)) && ch4 <= 0xBF) {
          i += 3;
          sb.append(ch);
          sb.append(ch2);
          sb.append(ch3);
          sb.append(ch4);
        }
        else if (isIgnore) {
        }
        else if (replacement != null)
          sb.append(replacement);
        else
          return sb;
      }
      else if (isIgnore) {
      }
      else if (replacement != null)
        sb.append(replacement);
      else
        return sb;
    }
    
    return sb;
  }
  
  public static CharSequence decode(Env env,
                                    StringValue str,
                                    String charset)
  {
    return decode(env, str, charset, null, false);
  }
  
  public static CharSequence decode(Env env,
                                    StringValue str,
                                    String charset,
                                    String replacement,
                                    boolean isIgnoreErrors)
  {
    Decoder decoder = Decoder.create(charset);
    
    decoder.setReplacement(replacement);
    decoder.setIgnoreErrors(isIgnoreErrors);
    
    return decoder.decode(env, str);
  }
  
  public static StringValue encode(Env env,
                                   CharSequence str,
                                   String charset)
  {
    return encode(env, str, charset, null, false);
  }
  
  public static StringValue encode(Env env,
                                   CharSequence str,
                                   String charset,
                                   String replacement,
                                   boolean isIgnoreErrors)
  {
    Encoder encoder = Encoder.create(charset);
    
    encoder.setReplacement(replacement);
    encoder.setIgnoreErrors(isIgnoreErrors);
    
    return encoder.encode(env, str);
  }
  
  public static StringValue decodeEncode(Env env,
                                         StringValue str,
                                         String inCharset,
                                         String outCharset,
                                         String replacement,
                                         boolean isIgnoreErrors)
  {
    boolean isStartUtf8 = false;
    boolean isEndUtf8 = false;
    
    if (inCharset.equalsIgnoreCase("utf8")
        || inCharset.equalsIgnoreCase("utf-8"))
      isStartUtf8 = true;
    
    if (outCharset.equalsIgnoreCase("utf8")
        || outCharset.equalsIgnoreCase("utf-8"))
      isEndUtf8 = true;
    
    if (isStartUtf8 && isEndUtf8)
      return UnicodeUtility.utf8Clean(env, str, null, isIgnoreErrors);
    
    // decode phase
    
    CharSequence unicodeStr;
    
    Decoder decoder;
    if (isStartUtf8)
      decoder = new Utf8Decoder(inCharset);
    else
      decoder = new GenericDecoder(inCharset);
    
    decoder.setIgnoreErrors(isIgnoreErrors);

    unicodeStr = decoder.decode(env, str);
    
    // encode phase
    
    Encoder encoder;
    if (isEndUtf8)
      encoder = new Utf8Encoder();
    else
      encoder = Encoder.create(outCharset);
    
    encoder.setIgnoreErrors(isIgnoreErrors);
    
    return encoder.encode(env, unicodeStr);
  }
}