// UTF_16_Reader.java example

// Explorer
// pluotsorbet-master
/*
 *   
 *
 * Copyright  1990-2009 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 only, as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is
 * included at /legal/license.txt).
 * 
 * You should have received a copy of the GNU General Public License
 * version 2 along with this work; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 * 
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
 * Clara, CA 95054 or visit www.sun.com if you need additional
 * information or have any questions.
 */

package com.sun.cldc.i18n.j2me;

import java.io.*;


/**
 * Reader for UTF-16 encoded input streams.
 *
 * Bytes are pulled one at a time from the underlying stream <code>in</code>
 * and combined pairwise into 16-bit chars. When the byte order is not yet
 * known, the first complete byte pair is checked for a Byte Order Mark
 * (BOM); a BOM selects the byte order and is not returned as data, while
 * the absence of a BOM selects big-endian.
 */
public class UTF_16_Reader extends com.sun.cldc.i18n.StreamReader {

    /**
     * The first byte of a pair of bytes that represent a 16-bit char,
     * or -1 when no byte is pending. A byte is left pending only when the
     * stream ended between the two bytes of a pair.
     */
    protected int firstByte = -1;
    /** the byteOrder variable has this value when the byte order
     * has not yet been specified or detected */
    protected static final int UNKNOWN_BYTE_ORDER = 0;
    /** the byteOrder variable has this value when the byte order
     * is Big Endian */
    protected static final int BIG_ENDIAN = 1;
    /** the byteOrder variable has this value when the byte order
     * is Little Endian */
    protected static final int LITTLE_ENDIAN = 2;
    /** the byte order: one of BIG_ENDIAN, LITTLE_ENDIAN, UNKNOWN_BYTE_ORDER */
    protected int byteOrder = UNKNOWN_BYTE_ORDER;

    /** mark() saves here a copy of firstByte */
    protected int markFirstByte;
    /** mark() saves here a copy of byteOrder */
    protected int markByteOrder;
    /** false if mark() has not been invoked yet */
    protected boolean markIsSet;
    /** The amount of bytes that mark() must reserve for BOM.
     *  Derived classes may set this field to 0.
     */
    protected int bytesForBOM;
    /** One Java (utf-16) character is 2 bytes.
     *  For the purposes of this class, we consider surrogate pairs as
     *  sequences of two Java characters.
     */
    protected static final int BYTES_PER_CHAR = 2;

    /** Constructs a UTF-16 reader that reserves two bytes for a BOM. */
    public UTF_16_Reader() {
        bytesForBOM = 2;
    }

    /**
     * Open the reader. Any decoding state left over from a previous use
     * (pending byte, detected byte order, mark) is discarded.
     * @param in the input stream to be read
     * @param enc identifies the encoding to be used
     * @return a reader for the given input stream and encoding
     * @throws UnsupportedEncodingException
     */
    public Reader open(InputStream in, String enc)
        throws UnsupportedEncodingException {
        firstByte = -1;
        byteOrder = UNKNOWN_BYTE_ORDER;
        markIsSet = false;
        super.open(in, enc);
        return this;
    }

    /** Convert two bytes to a 16-bit char
     * assuming the big endian byte order.
     * @param firstByte the first of two bytes representing a char (0..255)
     * @param secondByte the second of two bytes representing a char (0..255)
     * @return the character represented by the two bytes
     */
    protected char mergeBytesBigEndian(int firstByte, int secondByte) {
        return (char) ((firstByte << 8) + secondByte);
    }
    /** Convert two bytes to a 16-bit char
     * assuming the little endian byte order.
     * @param firstByte the first of two bytes representing a char (0..255)
     * @param secondByte the second of two bytes representing a char (0..255)
     * @return the character represented by the two bytes
     */
    protected char mergeBytesLittleEndian(int firstByte, int secondByte) {
        return (char) ((secondByte << 8) + firstByte);
    }
    /** Convert two bytes to a 16-bit char
     * using the current byte order. Any byte order other than
     * BIG_ENDIAN is treated as little endian.
     * @param firstByte the first of two bytes representing a char
     * @param secondByte the second of two bytes representing a char
     * @return the character represented by the two bytes
     */
    protected char mergeBytes(int firstByte, int secondByte) {
        if (byteOrder == BIG_ENDIAN) {
            return mergeBytesBigEndian(firstByte,secondByte);
        } else { // if (byteOrder == LITTLE_ENDIAN)
            return mergeBytesLittleEndian(firstByte,secondByte);
        }
    }

    /**
     * If the two argument bytes represent a Byte Order Mark (BOM),
     * set the byteOrder member to the corresponding byte order constant;
     * else set it to the default byte order.
     * @param firstByte the first of two bytes representing a char or BOM
     * @param secondByte the second of two bytes representing a char or BOM
     * @return true if it was a byte order mark, false if it was data
     */
    protected boolean bomDetect(int firstByte, int secondByte) {
        if (firstByte == 0xFE && secondByte == 0xFF) {
            byteOrder = BIG_ENDIAN;
            return true;
        } else if (firstByte == 0xFF && secondByte == 0xFE) {
            byteOrder = LITTLE_ENDIAN;
            return  true;
        } else { // default
            // The UTF-16 FAQ says that in absence of BOM
            // big-endian byte serialization is used.
            byteOrder = BIG_ENDIAN;
            return false;
        }
    }

    /**
     * Read a block of UTF16 characters.
     *
     * Exactly as many bytes as are needed for the returned characters
     * (plus a leading BOM, if any) are consumed from the underlying
     * stream; no byte is read ahead once <code>len</code> characters
     * have been stored. This keeps the byte budget computed by
     * {@link #mark(int)} accurate and avoids waiting on a stream for a
     * byte the caller did not ask for.
     *
     * @param cbuf output buffer for converted characters read
     * @param off initial offset into the provided buffer
     * @param len maximum number of characters to read
     * @return the number of converted characters, 0 if <code>len</code>
     * is not positive, or -1 if the end of the stream was reached before
     * any character could be converted
     * @exception IOException is thrown if the input stream 
     * could not be read for the raw unconverted character
     */
    public int read(char cbuf[], int off, int len) throws IOException {
        if (len <= 0) {
            // nothing requested: do not touch the underlying stream
            return 0;
        }

        int count = 0;
        while (count < len) {
            if (firstByte == -1) {
                firstByte = in.read();
                if (firstByte == -1) {
                    break; // end of stream on a char boundary
                }
            }
            final int secondByte = in.read();
            if (secondByte == -1) {
                // End of stream in the middle of a pair: keep firstByte
                // pending so that a later call can complete the char.
                break;
            }

            // bomDetect() is consulted only while the byte order is
            // unknown, i.e. for the first complete pair; it fixes
            // byteOrder as a side effect. A BOM is swallowed, not stored.
            if (byteOrder != UNKNOWN_BYTE_ORDER
                    || !bomDetect(firstByte, secondByte)) {
                cbuf[off + count] = mergeBytes(firstByte, secondByte);
                count++;
            }
            firstByte = -1; // the pair has been consumed
        }
        return (0 == count) ? -1 : count;
    }

    /**
     * Mark the present position in the stream. The current decoding
     * state (pending byte and byte order) is saved together with the
     * position of the underlying stream.
     *
     * @param readAheadLimit number of characters to buffer ahead
     * @exception  IOException  If an I/O error occurs or
     *             marking is not supported by the underlying input stream.
     */
    public void mark(int readAheadLimit) throws IOException {
        if (in.markSupported()) {
            markIsSet = true;
            markByteOrder = byteOrder;
            markFirstByte = firstByte;
            // Compute the byte budget in long arithmetic: doubling a large
            // character limit in int would wrap around to a negative value.
            long byteLimit =
                (long) readAheadLimit * BYTES_PER_CHAR + bytesForBOM;
            if (byteLimit > Integer.MAX_VALUE) {
                byteLimit = Integer.MAX_VALUE;
            }
            in.mark((int) byteLimit);
        } else {
            throw new IOException("mark() not supported");
        }
    }

    /**
     * Reset the stream to the position saved by the most recent
     * mark() and restore the decoding state that was saved with it.
     *
     * @exception IOException if the underlying input stream does not
     * support marking, if mark() has not been called since the reader
     * was opened, or if the underlying stream fails to reset
     */
    public void reset() throws IOException {
        if (!in.markSupported()) {
            throw new IOException("reset() not supported");
        }
        if (!markIsSet) {
            // There is no saved state to restore; restoring the default
            // field values would inject a bogus pending byte.
            throw new IOException("reset() without mark()");
        }
        // Reposition the stream first so that a failure here leaves the
        // decoding state untouched.
        in.reset();
        byteOrder = markByteOrder;
        firstByte = markFirstByte;
    }

    /**
     * Get the size in chars of an array of bytes.
     *
     * A Byte Order Mark found in the first two bytes of the region
     * (at <code>offset</code>) is not counted.
     *
     * @param      array  Source buffer
     * @param      offset Offset at which to start counting characters
     * @param      length number of bytes to use for counting
     *
     * @return     number of characters that would be converted
     */
    /*
     * This method is only used by our internal Helper class in the method
     * byteToCharArray to know how much to allocate before using a
     * reader. If we encounter bad encoding we should return a count
     * that includes that character so the reader will throw an IOException
     */
    public int sizeOf(byte[] array, int offset, int length) {
        int chars = length / BYTES_PER_CHAR;
        // A BOM needs two bytes; shorter regions cannot contain one and
        // must not be indexed.
        if (length >= BYTES_PER_CHAR) {
            int b1 = 0xff & array[offset];
            int b2 = 0xff & array[offset + 1];
            if ((b1 == 0xfe && b2 == 0xff)
              ||(b1 == 0xff && b2 == 0xfe)){
                // do not count BOM, it's not a part of data
                chars--;
            }
        }
        return chars;
    }
}