// VmUTF8Convert.java example
//
// Explorer
// jnode-master
/*
 * $Id$
 *
 * Copyright (C) 2003-2015 JNode.org
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; If not, write to the Free Software Foundation, Inc., 
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 
package org.jnode.vm.classmgr;

import java.io.UTFDataFormatException;
import java.nio.ByteBuffer;
import org.jnode.vm.InternString;

/**
 * VM_UTF8Convert
 * <p/>
 * Abstract class that contains conversion routines to/from utf8 and/or
 * pseudo-utf8. It does not support utf8 encodings of more than 3 bytes, so
 * every Java {@code char} (including each half of a surrogate pair) is
 * encoded on its own as one, two or three bytes.
 * <p/>
 * The difference between utf8 and pseudo-utf8 is the special treatment of null.
 * In utf8, null is encoded as the single byte 0x00, whereas in pseudo-utf8 it
 * is encoded as the two-byte sequence 0xC0 0x80. See the JVM spec for more
 * information.
 *
 * @author John Whaley
 */
public abstract class VmUTF8Convert {

    /**
     * Strictly check the format of the utf8/pseudo-utf8 byte array in fromUTF8.
     */
    static final boolean STRICTLY_CHECK_FORMAT = false;

    /**
     * Set fromUTF8 to not throw an exception when given a normal utf8 byte
     * array.
     */
    static final boolean ALLOW_NORMAL_UTF8 = false;

    /**
     * Set fromUTF8 to not throw an exception when given a pseudo utf8 byte
     * array.
     */
    static final boolean ALLOW_PSEUDO_UTF8 = true;

    /**
     * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8).
     */
    static final boolean WRITE_PSEUDO_UTF8 = true;

    /**
     * Convert the given sequence of UTF8 coded bytes into an interned Java String.
     * <p>
     * The acceptable input formats are controlled by the STRICTLY_CHECK_FORMAT,
     * ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 flags.
     * <p>
     * Bytes are fetched with relative {@code get()} calls, so the position of
     * {@code data} advances by the number of bytes consumed. Exactly
     * {@code length} bytes are consumed on success; a multi-byte sequence that
     * would extend past {@code length} is rejected instead of silently reading
     * bytes that belong to whatever follows in the buffer.
     * <p>
     * If {@code data} holds fewer than {@code length} remaining bytes, the
     * {@link java.nio.BufferUnderflowException} raised by the buffer is
     * propagated unchanged.
     *
     * @param data a buffer containing the UTF8 data.
     * @param result a temporary character buffer used to build the string; it
     *               must be able to hold every decoded character.
     * @param length the number of bytes to pull from the data buffer.
     * @return the resulting String
     * @throws UTFDataFormatException if the UTF8 is invalid or ends in the
     *                                middle of a multi-byte sequence
     */
    public static String fromUTF8(ByteBuffer data, char[] result, int length)
        throws UTFDataFormatException {
        int out = 0;
        int consumed = 0;
        while (consumed < length) {
            // Offset of the lead byte, used in diagnostics.
            final int start = consumed;
            final byte lead = data.get();
            consumed++;

            if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8 && lead == 0) {
                throw new UTFDataFormatException(
                    "0 byte encountered at location " + start);
            }

            if (lead >= 0) { // < 0x80 unsigned: a one-byte character
                result[out++] = (char) lead;
                continue;
            }

            try {
                // The continuation byte must lie inside the declared length.
                requireMore(consumed, length);
                final byte second = data.get();
                consumed++;

                if (lead < -32) { // < 0xe0 unsigned: two-byte form
                    final char c = (char) (((lead & 0x1f) << 6) | (second & 0x3f));
                    result[out++] = c;
                    if (STRICTLY_CHECK_FORMAT) {
                        if (((lead & 0xe0) != 0xc0) || !isContinuation(second)) {
                            throw new UTFDataFormatException(
                                "invalid marker bits for double byte char at location "
                                    + start);
                        }
                        if (c < 0x80) {
                            // Only the pseudo-utf8 null may be over-long.
                            if (!ALLOW_PSEUDO_UTF8 || (c != 0)) {
                                throw new UTFDataFormatException(
                                    "encountered double byte char that should have been single byte at location "
                                        + start);
                            }
                        } else if (c > 0x07FF) {
                            throw new UTFDataFormatException(
                                "encountered double byte char that should have been triple byte at location "
                                    + start);
                        }
                    }
                } else { // three-byte form
                    requireMore(consumed, length);
                    final byte third = data.get();
                    consumed++;

                    final char c = (char) (((lead & 0x0f) << 12)
                        | ((second & 0x3f) << 6) | (third & 0x3f));
                    result[out++] = c;
                    if (STRICTLY_CHECK_FORMAT) {
                        if (((lead & 0xf0) != 0xe0) || !isContinuation(second)
                            || !isContinuation(third)) {
                            throw new UTFDataFormatException(
                                "invalid marker bits for triple byte char at location "
                                    + start);
                        }
                        if (c < 0x0800) {
                            throw new UTFDataFormatException(
                                "encountered triple byte char that should have been fewer bytes at location "
                                    + start);
                        }
                    }
                }
            } catch (ArrayIndexOutOfBoundsException e) {
                // Kept for backward compatibility: a too-small result buffer
                // hit while storing a multi-byte character has always been
                // reported through this exception type.
                throw new UTFDataFormatException("unexpected end at location "
                    + consumed);
            }
        }
        return InternString.internString(new String(result, 0, out));
    }

    /**
     * Convert the given String into a sequence of (pseudo-)utf8 formatted
     * bytes.
     * <p/>
     * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
     *
     * @param s String to convert
     * @return array containing sequence of (pseudo-)utf8 formatted bytes
     */
    public static byte[] toUTF8(String s) {
        final byte[] encoded = new byte[utfLength(s)];
        final int count = s.length();
        int out = 0;
        for (int index = 0; index < count; index++) {
            final int ch = s.charAt(index);
            if (isSingleByte(ch)) {
                encoded[out++] = (byte) ch;
            } else if (ch > 0x07FF) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                encoded[out++] = (byte) (0xe0 | (ch >> 12));
                encoded[out++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
                encoded[out++] = (byte) (0x80 | (ch & 0x3f));
            } else {
                // 110xxxxx 10xxxxxx (also the pseudo-utf8 form of null)
                encoded[out++] = (byte) (0xc0 | (ch >> 6));
                encoded[out++] = (byte) (0x80 | (ch & 0x3f));
            }
        }
        return encoded;
    }

    /**
     * Returns the length of a string's UTF encoded form, i.e. the number of
     * bytes {@link #toUTF8(String)} produces for the same string.
     *
     * @param s String to measure
     * @return number of bytes in the (pseudo-)utf8 encoding of {@code s}
     */
    public static int utfLength(String s) {
        final int count = s.length();
        int total = 0;
        for (int index = 0; index < count; index++) {
            final int ch = s.charAt(index);
            if (isSingleByte(ch)) {
                total += 1;
            } else if (ch > 0x07FF) {
                total += 3;
            } else {
                total += 2;
            }
        }
        return total;
    }

    /**
     * Check whether the given sequence of bytes is valid (pseudo-)utf8.
     *
     * @param bytes byte array to check
     * @return true iff the given sequence is valid (pseudo-)utf8.
     */
    public static boolean check(byte[] bytes) {
        final int n = bytes.length;
        int pos = 0;
        while (pos < n) {
            final byte lead = bytes[pos++];
            if (lead >= 0) { // < 0x80 unsigned: a one-byte character
                if (lead == 0 && !ALLOW_NORMAL_UTF8) {
                    return false;
                }
                continue;
            }

            // Every remaining form needs at least one continuation byte.
            if (pos >= n) {
                return false;
            }
            final byte second = bytes[pos++];

            if (lead < -32) { // < 0xe0 unsigned: two-byte form
                if (((lead & 0xe0) != 0xc0) || !isContinuation(second)) {
                    return false;
                }
                final int c = ((lead & 0x1f) << 6) | (second & 0x3f);
                // The 11 payload bits cannot exceed 0x07FF, so only the
                // over-long case needs checking; the sole permitted over-long
                // value is the pseudo-utf8 null.
                if (c < 0x80 && (!ALLOW_PSEUDO_UTF8 || c != 0)) {
                    return false;
                }
            } else { // three-byte form
                if (pos >= n) {
                    return false;
                }
                final byte third = bytes[pos++];
                if (((lead & 0xf0) != 0xe0) || !isContinuation(second)
                    || !isContinuation(third)) {
                    return false;
                }
                final int c = ((lead & 0x0f) << 12) | ((second & 0x3f) << 6)
                    | (third & 0x3f);
                if (c < 0x0800) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Tells whether a character value is written as a single byte: anything up
     * to 0x7F, except null when pseudo-utf8 output is selected.
     */
    private static boolean isSingleByte(int ch) {
        return (ch <= 0x007F) && (!WRITE_PSEUDO_UTF8 || (ch >= 0x0001));
    }

    /**
     * Tells whether a byte carries the 10xxxxxx continuation marker.
     */
    private static boolean isContinuation(byte b) {
        return (b & 0xc0) == 0x80;
    }

    /**
     * Fails when all declared bytes have already been consumed, so that a
     * multi-byte sequence never reads beyond the declared length.
     */
    private static void requireMore(int consumed, int length)
        throws UTFDataFormatException {
        if (consumed >= length) {
            throw new UTFDataFormatException("unexpected end at location "
                + consumed);
        }
    }

}