// Utf8Decoder.java example
//
// Explorer
// resin-master
/*
 * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved
 *
 * This file is part of Resin(R) Open Source
 *
 * Each copy or derived work must preserve the copyright notice and this
 * notice unmodified.
 *
 * Resin Open Source is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Resin Open Source is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
 * of NON-INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Resin Open Source; if not, write to the
 *
 *   Free Software Foundation, Inc.
 *   59 Temple Place, Suite 330
 *   Boston, MA 02111-1307  USA
 *
 * @author Nam Nguyen
 */

package com.caucho.quercus.lib.i18n;

import java.util.logging.Logger;

import com.caucho.quercus.env.Env;
import com.caucho.quercus.env.StringValue;
import com.caucho.util.L10N;

/**
 * Decoder for UTF-8 input.
 *
 * Each char of the source StringValue is treated as one byte of a UTF-8
 * stream; the decoded text is produced as UTF-16 chars.
 */
public class Utf8Decoder
  extends Decoder
{
  private static final Logger log
    = Logger.getLogger(Utf8Decoder.class.getName());

  private static final L10N L = new L10N(Utf8Decoder.class);
  
  // returned by Utf8Reader.read() for a malformed sequence
  private static final int ERROR_CHARACTER = 0xFFFE;
  // returned by Utf8Reader.read() at the end of the input
  private static final int EOF = -1;
  
  public Utf8Decoder(String charset)
  {
    super(charset);
  }
  
  /**
   * Always returns true.
   */
  public boolean isUtf8()
  {
    return true;
  }
  
  /**
   * Returns true if the string can be decoded as UTF-8.
   *
   * A string for which isUnicode() is true is accepted without scanning.
   * Otherwise the string is scanned with a Utf8Reader and rejected as soon
   * as the reader reports ERROR_CHARACTER.
   *
   * @param env not used by this implementation
   * @param str the string to check
   */
  @Override
  public boolean isDecodable(Env env, StringValue str)
  {
    if (str.isUnicode())
      return true;

    Utf8Reader reader = new Utf8Reader(str);
    
    for (int ch = reader.read(); ch >= 0; ch = reader.read()) {
      if (ch == ERROR_CHARACTER)
        return false;
    }

    return true;
  }
  
  /**
   * Decodes the UTF-8 bytes in str into a StringBuilder of UTF-16 chars.
   *
   * Accepted lead bytes are 0x00-0x7F, 0xC2-0xDF, 0xE0-0xEF and 0xF0-0xF4,
   * each followed by the matching number of 0x80-0xBF continuation bytes.
   * Anything else, including a 4-byte sequence above U+10FFFF, is handed to
   * handleMalformed() one byte at a time: only the offending lead byte is
   * skipped, so the bytes after it are examined again.
   *
   * NOTE(review): overlong 3- and 4-byte forms are still accepted, and
   * 3-byte encoded surrogates are copied through as the UTF-16 code unit
   * they encode -- confirm this leniency is wanted.
   *
   * NOTE(review): an earlier reader-based implementation that was kept here
   * in a comment set _hasError on malformed input; this loop does not --
   * confirm whether callers expect it.
   *
   * @param env not used by this implementation
   * @param str the bytes to decode
   * @return the decoded chars; when malformed input is met and no error
   *         policy applies, only the chars decoded up to that point
   */
  @Override
  protected StringBuilder decodeImpl(Env env, StringValue str)
  {
    StringBuilder sb = new StringBuilder();
    
    int len = str.length();
    for (int i = 0; i < len; i++) {
      int consumed = decodeSequence(sb, str, i, len);

      if (consumed > 0) {
        // the loop increment accounts for the lead byte
        i += consumed - 1;
      }
      else if (! handleMalformed(sb, str.charAt(i)))
        return sb;
    }

    return sb;
  }

  /**
   * Decodes the sequence starting at index i and appends it to sb.
   *
   * @return the number of bytes consumed, or 0 if the bytes at i do not
   *         form an accepted sequence (nothing is appended in that case)
   */
  private static int decodeSequence(StringBuilder sb,
                                    StringValue str,
                                    int i,
                                    int len)
  {
    int ch = str.charAt(i);

    if (ch <= 0x7F) {
      sb.append((char) ch);
      return 1;
    }

    if (0xC2 <= ch && ch <= 0xDF) {
      if (! hasContinuations(str, i, len, 1))
        return 0;

      int ch2 = str.charAt(i + 1);

      int code = ((ch - 0xC0) << 6) + (ch2 - 0x80);

      sb.append((char) code);
      return 2;
    }

    if (0xE0 <= ch && ch <= 0xEF) {
      if (! hasContinuations(str, i, len, 2))
        return 0;

      int ch2 = str.charAt(i + 1);
      int ch3 = str.charAt(i + 2);

      int code = ((ch - 0xE0) << 12)
                 + ((ch2 - 0x80) << 6)
                 + (ch3 - 0x80);

      // at most 0xFFFF, so it is always a single UTF-16 code unit; an
      // encoded surrogate half is emitted as that same half
      sb.append((char) code);
      return 3;
    }

    if (0xF0 <= ch && ch <= 0xF4) {
      if (! hasContinuations(str, i, len, 3))
        return 0;

      int ch2 = str.charAt(i + 1);
      int ch3 = str.charAt(i + 2);
      int ch4 = str.charAt(i + 3);

      int code = ((ch - 0xF0) << 18)
                 + ((ch2 - 0x80) << 12)
                 + ((ch3 - 0x80) << 6)
                 + (ch4 - 0x80);

      // lead byte 0xF4 with a second byte of 0x90 or above lands past the
      // last Unicode code point and cannot be represented
      if (code > Character.MAX_CODE_POINT)
        return 0;

      if (code >= Character.MIN_SUPPLEMENTARY_CODE_POINT)
        decodeCodePoint(sb, code);
      else
        sb.append((char) code);

      return 4;
    }

    return 0;
  }

  /**
   * Returns true if the count bytes after index i all exist and are in the
   * continuation range 0x80-0xBF.
   */
  private static boolean hasContinuations(StringValue str,
                                          int i,
                                          int len,
                                          int count)
  {
    if (i + count >= len)
      return false;

    for (int k = 1; k <= count; k++) {
      int ch = str.charAt(i + k);

      if (ch < 0x80 || 0xBF < ch)
        return false;
    }

    return true;
  }

  /**
   * Applies the error policy to the malformed byte ch.
   *
   * The policy is read from _isIgnoreErrors, _replacement and
   * _isAllowMalformedOut, which are not declared in this class (presumably
   * inherited from Decoder). They are checked in that order: drop the byte,
   * append the replacement, or append the byte unchanged.
   *
   * @return false if none of the policies applies and decoding must stop
   */
  private boolean handleMalformed(StringBuilder sb, int ch)
  {
    if (_isIgnoreErrors) {
      // the malformed byte is dropped
    }
    else if (_replacement != null)
      sb.append(_replacement);
    else if (_isAllowMalformedOut)
      sb.append((char) ch);
    else
      return false;

    return true;
  }
  
  /**
   * Appends a supplementary code point as its UTF-16 surrogate pair.
   *
   * @param code a code point in the range 0x10000-0x10FFFF
   */
  private static void decodeCodePoint(StringBuilder sb, int code)
  {
    sb.appendCodePoint(code);
  }
  
  /**
   * Reads a StringValue holding UTF-8 bytes one UTF-16 code unit at a time.
   */
  static class Utf8Reader
  {
    // low surrogate waiting to be returned by the next read(), or -1
    int _peek = -1;
    
    int _index;
    final int _len;
    StringValue _str;
    
    public Utf8Reader(StringValue str)
    {
      _str = str;
      _len = str.length();
    }
    
    /**
     * Returns the next UTF-16 code unit.
     *
     * A 4-byte sequence is returned as two code units over two calls.
     * A decoded U+FEFF is skipped wherever it appears.
     *
     * A decoded value that equals ERROR_CHARACTER (0xFFFE) cannot be told
     * apart from an error by the caller.
     *
     * @return the code unit, ERROR_CHARACTER for a malformed sequence, or
     *         EOF at the end of the input
     */
    public int read()
    {
      if (_peek >= 0) {
        // second half of a surrogate pair: already decoded, so it must not
        // be classified as a lead byte again
        int low = _peek;
        _peek = -1;
        return low;
      }

      while (true) {
        int ch1 = readByte();

        // ASCII, or EOF which is negative
        if (ch1 < 0x80)
          return ch1;

        if ((ch1 & 0xe0) == 0xc0) {
          int tail = readTail(1);

          if (tail < 0)
            return ERROR_CHARACTER;

          return ((ch1 & 0x1f) << 6) + tail;
        }
        else if ((ch1 & 0xf0) == 0xe0) {
          int tail = readTail(2);

          if (tail < 0)
            return ERROR_CHARACTER;

          int ch = ((ch1 & 0x0f) << 12) + tail;

          // handle some writers, e.g. microsoft: skip the byte order mark
          // and decode whatever follows it
          if (ch == 0xfeff)
            continue;

          return ch;
        }
        else if ((ch1 & 0xf0) == 0xf0) {
          int tail = readTail(3);

          if (tail < 0)
            return ERROR_CHARACTER;

          int ch = ((ch1 & 0xf) << 18) + tail;

          // overlong forms and values past the last code point have no
          // surrogate pair
          if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT
              || Character.MAX_CODE_POINT < ch)
            return ERROR_CHARACTER;

          int offset = ch - Character.MIN_SUPPLEMENTARY_CODE_POINT;

          _peek = 0xdc00 + (offset & 0x3ff);

          return 0xd800 + (offset >> 10);
        }
        else
          return ERROR_CHARACTER;
      }
    }

    /**
     * Reads count continuation bytes and returns their payload bits, six
     * per byte, most significant first.
     *
     * On failure the index is moved back to just after the lead byte, so
     * the bytes read so far are examined again by the next read().
     *
     * @return the combined payload bits, or -1 if the input ended or a
     *         byte was not a continuation byte
     */
    private int readTail(int count)
    {
      int bits = 0;

      for (int k = 1; k <= count; k++) {
        int ch = readByte();

        if (ch < 0) {
          // readByte() does not advance at the end of the input
          unread(k - 1);
          return -1;
        }
        else if ((ch & 0xc0) != 0x80) {
          unread(k);
          return -1;
        }

        bits = (bits << 6) + (ch & 0x3f);
      }

      return bits;
    }
    
    /**
     * Returns the next raw byte, or EOF at the end of the input.
     */
    private int readByte()
    {
      if (_index < _len)
        return _str.charAt(_index++);
      else
        return EOF;
    }
    
    /**
     * Moves the read position back by count bytes.
     */
    private void unread(int count)
    {
      _index -= count;
    }
  }
}