// UnicodeUtils.java example
//
// Explorer
// appengine-export-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.sanselan.util;

import java.io.UnsupportedEncodingException;

import org.apache.sanselan.common.BinaryConstants;

public abstract class UnicodeUtils implements BinaryConstants
{
    /**
     * Private so that only the nested implementations declared in this file
     * can extend this class. Callers obtain an instance through
     * {@link #getInstance(int)}.
     */
    private UnicodeUtils()
    {
    }

    /**
     * Checked exception thrown by this class when byte data is not valid for
     * the expected encoding, or when an unknown encoding code is requested.
     */
    public static class UnicodeException extends Exception
    {
        /**
         * @param message
         *            human-readable description of the problem.
         */
        public UnicodeException(String message)
        {
            super(message);
        }
    }

    /*
     * Character encoding codes. All of them except
     * CHAR_ENCODING_CODE_AMBIGUOUS are accepted by getInstance(int).
     */

    // A default single-byte charset.
    public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
    // UTF-16 preceded by a byte order mark. getInstance(int) maps both
    // WITH_BOM codes to the same implementation, which takes the byte order
    // from the mark found in the data.
    public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
    public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
    // UTF-16 without a byte order mark; the byte order comes from the code.
    public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
    public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
    public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
    // Not an encoding: getInstance(int) rejects this value. It is only
    // returned by the (currently commented-out) guessing routines below.
    public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;

    // /*
    // * Guess the character encoding of arbitrary character data in a data
    // * buffer.
    // *
    // * The data may not run to the end of the buffer; it may be terminated.
    // * This makes the problem much harder, since the character data may be
    // * followed by arbitrary data.
    // */
    // public static int guessCharacterEncoding(byte bytes[], int index)
    // {
    // int length = bytes.length - index;
    //
    // if (length < 1)
    // return CHAR_ENCODING_CODE_AMBIGUOUS;
    //
    // if (length >= 2)
    // {
    // // look for BOM.
    //
    // int c1 = 0xff & bytes[index];
    // int c2 = 0xff & bytes[index + 1];
    // if (c1 == 0xFF && c2 == 0xFE)
    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
    // else if (c1 == 0xFE && c2 == 0xFF)
    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
    // }
    //
    // }
    //
    // /*
    // * Guess the character encoding of arbitrary character data in a data
    // * buffer.
    // *
    // * The data fills the entire buffer. If it is terminated, the terminator
    // * byte(s) will be the last bytes in the buffer.
    // *
    // * This makes the problem a bit easier.
    // */
    // public static int guessCharacterEncodingSimple(byte bytes[], int index)
    // throws UnicodeException
    // {
    // int length = bytes.length - index;
    //
    // if (length < 1)
    // return CHAR_ENCODING_CODE_AMBIGUOUS;
    //
    // if (length >= 2)
    // {
    // // identify or eliminate UTF-16 with a BOM.
    //
    // int c1 = 0xff & bytes[index];
    // int c2 = 0xff & bytes[index + 1];
    // if (c1 == 0xFF && c2 == 0xFE)
    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
    // else if (c1 == 0xFE && c2 == 0xFF)
    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
    // }
    //
    // if (length >= 2)
    // {
    // // look for optional double-byte terminator.
    //
    // int c1 = 0xff & bytes[bytes.length - 2];
    // int c2 = 0xff & bytes[bytes.length - 1];
    // if (c1 == 0 && c2 == 0)
    // {
    // // definitely a flavor of UTF-16.
    // if (length % 2 != 0)
    // throw new UnicodeException(
    // "Character data with double-byte terminator has an odd length.");
    //
    // boolean mayHaveTerminator = true;
    // boolean mustHaveTerminator = false;
    // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
    // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
    // mayHaveTerminator, mustHaveTerminator);
    // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
    // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
    // mayHaveTerminator, mustHaveTerminator);
    // if ((!possibleBigEndian) && (!possibleLittleEndian))
    // throw new UnicodeException(
    // "Invalid character data, possibly UTF-16.");
    // if (possibleBigEndian && possibleLittleEndian)
    // return CHAR_ENCODING_CODE_AMBIGUOUS;
    // if (possibleBigEndian)
    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
    // if (possibleLittleEndian)
    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
    // }
    // }
    //
    // List possibleEncodings = new ArrayList();
    // if (length % 2 == 0)
    // {
    // boolean mayHaveTerminator = true;
    // boolean mustHaveTerminator = false;
    // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
    // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
    // mayHaveTerminator, mustHaveTerminator);
    // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
    // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
    // mayHaveTerminator, mustHaveTerminator);
    //
    // if (possibleBigEndian)
    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
    // if (possibleLittleEndian)
    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
    // }
    //
    // }

    /**
     * Tells whether a string survives an encode/decode round trip through
     * ISO-8859-1 unchanged, i.e. whether every character of the string can
     * be represented in that charset.
     *
     * @param s
     *            the string to check; must not be null.
     * @return true if encoding {@code s} as ISO-8859-1 and decoding it again
     *         yields a string equal to {@code s}.
     */
    public static final boolean isValidISO_8859_1(String s)
    {
        final String charsetName = "ISO-8859-1";

        String decoded;
        try
        {
            byte encoded[] = s.getBytes(charsetName);
            decoded = new String(encoded, charsetName);
        }
        catch (UnsupportedEncodingException e)
        {
            // ISO-8859-1 is expected to be available, so this should never
            // be thrown.
            throw new RuntimeException("Error parsing string.", e);
        }

        return s.equals(decoded);
    }

    /**
     * Finds the first UTF-16 terminator, i.e. the first pair of zero bytes
     * that is aligned to a two-byte boundary counted from {@code index}.
     *
     * @param bytes
     *            the buffer to scan.
     * @param index
     *            the offset at which scanning starts.
     * @return the offset of the first byte of the terminator, or -1 if no
     *         terminator is found.
     */
    private static int findFirstDoubleByteTerminator(byte bytes[], int index)
    {
        // Stop one short of the end so that i + 1 is always a valid offset.
        for (int i = index; i < bytes.length - 1; i += 2)
        {
            // Inspect the pair at the current position (the loop variable),
            // not the pair at the starting offset.
            int c1 = 0xff & bytes[i];
            int c2 = 0xff & bytes[i + 1];
            if (c1 == 0 && c2 == 0)
                return i;
        }
        return -1;
    }

    /**
     * Locates the end of the character data that starts at {@code index},
     * counting the terminator as part of the data.
     *
     * @param bytes
     *            the buffer holding the character data.
     * @param index
     *            the offset at which the character data starts.
     * @return the result of {@code findEnd(bytes, index, true)}.
     * @throws UnicodeException
     *             if the implementation rejects the data.
     */
    public final int findEndWithTerminator(byte bytes[], int index)
            throws UnicodeException
    {
        final boolean includeTerminator = true;
        return findEnd(bytes, index, includeTerminator);
    }

    /**
     * Locates the end of the character data that starts at {@code index},
     * not counting the terminator as part of the data.
     *
     * @param bytes
     *            the buffer holding the character data.
     * @param index
     *            the offset at which the character data starts.
     * @return the result of {@code findEnd(bytes, index, false)}.
     * @throws UnicodeException
     *             if the implementation rejects the data.
     */
    public final int findEndWithoutTerminator(byte bytes[], int index)
            throws UnicodeException
    {
        final boolean includeTerminator = false;
        return findEnd(bytes, index, includeTerminator);
    }

    /**
     * Encoding-specific scan for the end of the character data that starts
     * at {@code index}. Each nested implementation in this file defines what
     * a terminator looks like for its encoding.
     *
     * @param bytes
     *            the buffer holding the character data.
     * @param index
     *            the offset at which the character data starts.
     * @param includeTerminator
     *            whether the returned offset should lie after (true) or
     *            before (false) the terminator.
     * @return the offset of the end of the character data.
     * @throws UnicodeException
     *             if the implementation rejects the data.
     */
    protected abstract int findEnd(byte bytes[], int index,
            boolean includeTerminator) throws UnicodeException;

    /**
     * Returns the helper that locates the end of character data stored in
     * the given encoding.
     *
     * @param charEncodingCode
     *            one of the {@code CHAR_ENCODING_CODE_*} constants other
     *            than {@code CHAR_ENCODING_CODE_AMBIGUOUS}.
     * @return a new helper instance for the encoding.
     * @throws UnicodeException
     *             if the code is not one of the supported constants.
     */
    public static UnicodeUtils getInstance(int charEncodingCode)
            throws UnicodeException
    {
        switch (charEncodingCode)
        {
        case CHAR_ENCODING_CODE_ISO_8859_1:
            return new UnicodeMetricsASCII();
        case CHAR_ENCODING_CODE_UTF_8:
            return new UnicodeMetricsUTF8();
        case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
        case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
            // The byte order is read from the mark in the data itself.
            return new UnicodeMetricsUTF16WithBOM();
        case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
            // The byte order must be one of the constants declared by
            // UnicodeMetricsUTF16, because that is what the class compares
            // against. The unqualified names would resolve, in this scope,
            // to the same-named constants inherited from BinaryConstants,
            // which are not guaranteed to have the same values.
            return new UnicodeMetricsUTF16NoBOM(
                    UnicodeMetricsUTF16.BYTE_ORDER_BIG_ENDIAN);
        case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
            return new UnicodeMetricsUTF16NoBOM(
                    UnicodeMetricsUTF16.BYTE_ORDER_LITTLE_ENDIAN);
        default:
            throw new UnicodeException("Unknown char encoding code: "
                    + charEncodingCode);
        }
    }

    /**
     * Helper for single-byte character data, where the terminator is a
     * single zero byte.
     */
    private static class UnicodeMetricsASCII extends UnicodeUtils
    {
        /**
         * Scans forward from {@code index} for the first zero byte.
         *
         * @return the offset of the zero byte (or the offset just after it
         *         when {@code includeTerminator} is true); the buffer length
         *         if no zero byte is found.
         */
        public int findEnd(byte bytes[], int index, boolean includeTerminator)
                throws UnicodeException
        {
            int pos = index;
            while (pos < bytes.length && bytes[pos] != 0)
                pos++;

            // A missing terminator is tolerated: the data is taken to run to
            // the end of the buffer.
            if (pos >= bytes.length)
                return bytes.length;

            if (includeTerminator)
                return pos + 1;
            return pos;
        }
    }

    // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
    // {
    // public int findEnd(byte bytes[], int index, boolean includeTerminator)
    // throws UnicodeException
    // {
    // for (int i = index; i < bytes.length; i++)
    // {
    // if (bytes[i] == 0)
    // return includeTerminator ? i + 1 : i;
    // }
    // return bytes.length;
    // // throw new UnicodeException("Terminator not found.");
    // }
    // }

    /**
     * Helper for UTF-8 character data, where the terminator is a single zero
     * byte.
     */
    private static class UnicodeMetricsUTF8 extends UnicodeUtils
    {

        /**
         * Walks the buffer one UTF-8 sequence at a time, starting at
         * {@code index}, until a zero byte or the end of the buffer is
         * reached.
         *
         * @return the offset of the zero byte (or the offset just after it
         *         when {@code includeTerminator} is true); the buffer length
         *         if the data runs to the end of the buffer without one.
         * @throws UnicodeException
         *             if {@code index} lies beyond the end of the buffer, if
         *             a sequence is cut off by the end of the buffer, or if
         *             a lead or continuation byte is out of range.
         */
        public int findEnd(byte bytes[], int index, boolean includeTerminator)
                throws UnicodeException
        {
            // http://en.wikipedia.org/wiki/UTF-8

            while (true)
            {
                if (index == bytes.length)
                    return bytes.length;
                if (index > bytes.length)
                    throw new UnicodeException("Terminator not found.");

                int lead = 0xff & bytes[index++];
                if (lead == 0)
                    return includeTerminator ? index : index - 1;
                if (lead <= 0x7f)
                    continue; // single-byte sequence.

                // Number of continuation bytes announced by the lead byte.
                int trailing;
                if (lead <= 0xBF)
                {
                    // 0x80..0xBF are continuation bytes and can never start
                    // a sequence.
                    throw new UnicodeException("Invalid code point.");
                }
                else if (lead <= 0xDF)
                    trailing = 1;
                else if (lead <= 0xEF)
                    trailing = 2;
                else if (lead <= 0xF4)
                    trailing = 3;
                else
                    throw new UnicodeException("Invalid code point.");

                // The whole sequence must fit in the buffer; this is checked
                // before any continuation byte is examined.
                if (index + trailing > bytes.length)
                    throw new UnicodeException("Invalid unicode.");

                for (int end = index + trailing; index < end; index++)
                {
                    int c = 0xff & bytes[index];
                    if (c < 0x80 || c > 0xBF)
                        throw new UnicodeException("Invalid code point.");
                }
            }
        }
    }

    /**
     * Shared logic for UTF-16 character data, where the terminator is a
     * 16-bit unit whose two bytes are both zero.
     */
    private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
    {
        protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
        protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;

        // One of the two BYTE_ORDER_* constants above. Not final: the
        // with-BOM subclass overwrites it once it has read the mark.
        protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;

        /**
         * @param byteOrder
         *            BYTE_ORDER_BIG_ENDIAN or BYTE_ORDER_LITTLE_ENDIAN as
         *            declared by this class.
         */
        public UnicodeMetricsUTF16(int byteOrder)
        {
            this.byteOrder = byteOrder;
        }

        /**
         * Tells whether the most significant byte of a 16-bit unit marks a
         * high (leading) surrogate, U+D800..U+DBFF.
         */
        private static boolean isHighSurrogate(int msb)
        {
            return msb >= 0xD8 && msb <= 0xDB;
        }

        /**
         * Tells whether the most significant byte of a 16-bit unit marks a
         * low (trailing) surrogate, U+DC00..U+DFFF.
         */
        private static boolean isLowSurrogate(int msb)
        {
            return msb >= 0xDC && msb <= 0xDF;
        }

        /**
         * Checks whether the buffer, from {@code index} on, holds well-formed
         * UTF-16 in this instance's byte order.
         *
         * @param bytes
         *            the buffer holding the character data.
         * @param index
         *            the offset at which the character data starts.
         * @param mayHaveTerminator
         *            result to report when a terminator is found.
         * @param mustHaveTerminator
         *            if true, reaching the end of the buffer without a
         *            terminator makes the data invalid.
         * @return true if the data is acceptable under the given rules.
         */
        public boolean isValid(byte bytes[], int index,
                boolean mayHaveTerminator, boolean mustHaveTerminator)
                throws UnicodeException
        {
            // http://en.wikipedia.org/wiki/UTF-16/UCS-2

            while (true)
            {
                if (index == bytes.length)
                {
                    // end of buffer, no terminator found.
                    return !mustHaveTerminator;
                }

                if (index >= bytes.length - 1)
                {
                    // end of odd-length buffer, no terminator found.
                    return false;
                }

                int c1 = 0xff & bytes[index++];
                int c2 = 0xff & bytes[index++];
                int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;

                if (c1 == 0 && c2 == 0)
                {
                    // terminator found.
                    return mayHaveTerminator;
                }

                if (isHighSurrogate(msb1))
                {
                    if (index >= bytes.length - 1)
                    {
                        // missing second surrogate.
                        return false;
                    }

                    // second word.
                    int c3 = 0xff & bytes[index++];
                    int c4 = 0xff & bytes[index++];
                    int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
                    if (!isLowSurrogate(msb2))
                    {
                        // invalid second surrogate.
                        return false;
                    }
                }
                else if (isLowSurrogate(msb1))
                {
                    // low surrogate without a preceding high surrogate.
                    return false;
                }
                // Anything else, including U+E000..U+FFFF, is an ordinary
                // single-unit character.
            }
        }

        /**
         * Walks the buffer one UTF-16 character at a time, starting at
         * {@code index}, until a terminator or the end of the buffer is
         * reached.
         *
         * @return the offset of the terminator (or the offset just after it
         *         when {@code includeTerminator} is true); the buffer length
         *         if the data runs to the end of the buffer without one.
         * @throws UnicodeException
         *             if the buffer ends in the middle of a 16-bit unit or of
         *             a surrogate pair, or if a surrogate is unpaired.
         */
        public int findEnd(byte bytes[], int index, boolean includeTerminator)
                throws UnicodeException
        {
            // http://en.wikipedia.org/wiki/UTF-16/UCS-2

            while (true)
            {
                if (index == bytes.length)
                    return bytes.length;
                // Fewer than two bytes left: report it instead of reading
                // past the end of the buffer.
                if (index >= bytes.length - 1)
                    throw new UnicodeException("Terminator not found.");

                int c1 = 0xff & bytes[index++];
                int c2 = 0xff & bytes[index++];
                int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;

                if (c1 == 0 && c2 == 0)
                {
                    return includeTerminator ? index : index - 2;
                }
                else if (isHighSurrogate(msb1))
                {
                    if (index >= bytes.length - 1)
                        throw new UnicodeException("Terminator not found.");

                    // second word.
                    int c3 = 0xff & bytes[index++];
                    int c4 = 0xff & bytes[index++];
                    int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
                    if (!isLowSurrogate(msb2))
                        throw new UnicodeException("Invalid code point.");
                }
                else if (isLowSurrogate(msb1))
                {
                    // low surrogate without a preceding high surrogate.
                    throw new UnicodeException("Invalid code point.");
                }
            }
        }
    }

    /**
     * UTF-16 helper for data without a byte order mark: the byte order is
     * fixed by the constructor argument and all scanning logic is inherited
     * unchanged from UnicodeMetricsUTF16.
     */
    private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
    {

        /**
         * @param byteOrder
         *            the byte order to assume; passed straight through to
         *            the superclass constructor.
         */
        public UnicodeMetricsUTF16NoBOM(final int byteOrder)
        {
            super(byteOrder);
        }

    }

    /**
     * UTF-16 helper for data that starts with a byte order mark. The byte
     * order is taken from the mark on every call to findEnd.
     */
    private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
    {

        /**
         * Starts out big-endian; the value is replaced as soon as a byte
         * order mark has been read.
         */
        public UnicodeMetricsUTF16WithBOM()
        {
            super(BYTE_ORDER_BIG_ENDIAN);
        }

        /**
         * Reads the two-byte mark at {@code index}, records the byte order
         * it announces, and then lets the superclass scan the data that
         * follows the mark.
         *
         * @throws UnicodeException
         *             if fewer than two bytes are available at {@code index}
         *             or if they are not a UTF-16 byte order mark, in
         *             addition to the conditions reported by the superclass.
         */
        public int findEnd(byte bytes[], int index, boolean includeTerminator)
                throws UnicodeException
        {
            // http://en.wikipedia.org/wiki/UTF-16/UCS-2

            if (index >= bytes.length - 1)
                throw new UnicodeException("Missing BOM.");

            // Combine the two mark bytes, in buffer order, into one value.
            final int first = 0xff & bytes[index];
            final int second = 0xff & bytes[index + 1];
            final int mark = (first << 8) | second;

            switch (mark)
            {
            case 0xFFFE:
                byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
                break;
            case 0xFEFF:
                byteOrder = BYTE_ORDER_BIG_ENDIAN;
                break;
            default:
                throw new UnicodeException("Invalid byte order mark.");
            }

            // The character data begins right after the mark.
            return super.findEnd(bytes, index + 2, includeTerminator);
        }
    }

}