/*
*
*
* Copyright 1990-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 only, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is
* included at /legal/license.txt).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 or visit www.sun.com if you need additional
* information or have any questions.
*/
package com.sun.cldc.i18n.j2me;
import java.io.*;
/**
 * Reader for UTF-16 encoded input streams.
 *
 * Bytes are consumed in pairs from the underlying stream and merged into
 * 16-bit chars. Unless the byte order is already known, the first pair is
 * inspected for a Byte Order Mark (BOM): a BOM selects the byte order and
 * is not delivered as data; any other pair selects big endian and is
 * delivered as the first character.
 */
public class UTF_16_Reader extends com.sun.cldc.i18n.StreamReader {
    /**
     * The first byte of a pair of bytes that represent a 16-bit char,
     * or -1 when no byte has been read ahead (or end of stream was hit).
     */
    protected int firstByte = -1;

    /** the byteOrder variable has this value when the byte order
     * has not yet been specified or detected */
    protected static final int UNKNOWN_BYTE_ORDER = 0;

    /** the byteOrder variable has this value when the byte order
     * is Big Endian */
    protected static final int BIG_ENDIAN = 1;

    /** the byteOrder variable has this value when the byte order
     * is Little Endian */
    protected static final int LITTLE_ENDIAN = 2;

    /** the byte order: one of BIG_ENDIAN, LITTLE_ENDIAN, UNKNOWN_BYTE_ORDER */
    protected int byteOrder = UNKNOWN_BYTE_ORDER;

    /** mark() saves here a copy of firstByte */
    protected int markFirstByte;

    /** mark() saves here a copy of byteOrder */
    protected int markByteOrder;

    /** false if mark() has not been invoked yet */
    protected boolean markIsSet;

    /** The amount of bytes that mark() must reserve for BOM.
     * Derived classes may set this field to 0.
     */
    protected int bytesForBOM;

    /** One Java (utf-16) character is 2 bytes.
     * For the purposes of this class, we consider surrogate pairs as
     * sequences of two Java characters.
     */
    protected static final int BYTES_PER_CHAR = 2;

    /** Constructs a UTF-16 reader that reserves two bytes for a BOM. */
    public UTF_16_Reader() {
        bytesForBOM = 2;
    }

    /**
     * Open the reader. Any decoding state left over from a previous use
     * (read-ahead byte, detected byte order, mark) is discarded before
     * the call is delegated to the superclass.
     *
     * @param in the input stream to be read
     * @param enc identifies the encoding to be used
     * @return this reader, opened on the given input stream
     * @throws UnsupportedEncodingException declared for the superclass
     *         open() call
     */
    public Reader open(InputStream in, String enc)
            throws UnsupportedEncodingException {
        firstByte = -1;
        byteOrder = UNKNOWN_BYTE_ORDER;
        markIsSet = false;
        super.open(in, enc);
        return this;
    }

    /** Convert two bytes to a 16-bit char
     * assuming the big endian byte order.
     * @param firstByte the first of two bytes representing a char
     * @param secondByte the second of two bytes representing a char
     * @return the character represented by the two bytes
     */
    protected char mergeBytesBigEndian(int firstByte, int secondByte) {
        return (char) ((firstByte << 8) + secondByte);
    }

    /** Convert two bytes to a 16-bit char
     * assuming the little endian byte order.
     * @param firstByte the first of two bytes representing a char
     * @param secondByte the second of two bytes representing a char
     * @return the character represented by the two bytes
     */
    protected char mergeBytesLittleEndian(int firstByte, int secondByte) {
        return (char) ((secondByte << 8) + firstByte);
    }

    /** Convert two bytes to a 16-bit char
     * using the current byte order.
     * Any byte order other than BIG_ENDIAN is treated as little endian.
     * @param firstByte the first of two bytes representing a char
     * @param secondByte the second of two bytes representing a char
     * @return the character represented by the two bytes
     */
    protected char mergeBytes(int firstByte, int secondByte) {
        if (byteOrder == BIG_ENDIAN) {
            return mergeBytesBigEndian(firstByte, secondByte);
        }
        return mergeBytesLittleEndian(firstByte, secondByte);
    }

    /**
     * If the two argument bytes represent a Byte Order Mark (BOM),
     * set the byteOrder member to the corresponding byte order constant;
     * else set it to the default byte order.
     * @param firstByte the first of two bytes representing a char or BOM
     * @param secondByte the second of two bytes representing a char or BOM
     * @return true if it was a byte order mark, false if it was data
     */
    protected boolean bomDetect(int firstByte, int secondByte) {
        if (firstByte == 0xFE && secondByte == 0xFF) {
            byteOrder = BIG_ENDIAN;
            return true;
        }
        if (firstByte == 0xFF && secondByte == 0xFE) {
            byteOrder = LITTLE_ENDIAN;
            return true;
        }
        // The UTF-16 FAQ says that in absence of BOM
        // big-endian byte serialization is used.
        byteOrder = BIG_ENDIAN;
        return false;
    }

    /**
     * Read a block of UTF16 characters.
     *
     * After each decoded pair the first byte of the next pair is read
     * ahead and kept in firstByte, so the underlying stream is always
     * positioned one byte past the last character returned (unless the
     * end of the stream has been reached).
     *
     * @param cbuf output buffer for converted characters read
     * @param off initial offset into the provided buffer
     * @param len maximum number of characters to read
     * @return the number of converted characters, 0 if len is 0, or -1
     *         if the end of the stream was reached before any character
     *         could be read
     * @exception IOException is thrown if the input stream
     * could not be read for the raw unconverted character
     */
    public int read(char cbuf[], int off, int len) throws IOException {
        int count = 0;
        int secondByte;
        if (len == 0) {
            return 0;
        }
        if (firstByte == -1) {
            // no look-ahead byte pending: fetch the first byte of the pair
            firstByte = in.read();
        }
        for ( ; count < len; firstByte = in.read()) {
            // end of stream, either on a pair boundary or in mid-pair
            if (firstByte == -1 || (secondByte = in.read()) == -1) {
                return (count == 0) ? -1 : count;
            }
            // only while the byte order is unknown: examine the pair for
            // a BOM; a BOM is consumed and not stored in the output
            if (byteOrder == UNKNOWN_BYTE_ORDER
                    && bomDetect(firstByte, secondByte)) {
                continue;
            }
            cbuf[off + count] = mergeBytes(firstByte, secondByte);
            count++;
        }
        return count;
    }

    /**
     * Mark the present position in the stream.
     *
     * The current read-ahead byte and byte order are saved so that
     * reset() can restore them together with the stream position.
     *
     * @param readAheadLimit number of characters to buffer ahead
     * @exception IOException If an I/O error occurs or
     * marking is not supported by the underlying input stream.
     */
    public void mark(int readAheadLimit) throws IOException {
        if (!in.markSupported()) {
            throw new IOException("mark() not supported");
        }
        markIsSet = true;
        markByteOrder = byteOrder;
        markFirstByte = firstByte;
        // Bytes that may be consumed after the mark: the characters
        // themselves, an optional BOM, and the single byte that read()
        // fetches ahead after the last character. Computed as a long and
        // clamped so that a large readAheadLimit cannot overflow.
        long byteLimit =
            (long) readAheadLimit * BYTES_PER_CHAR + bytesForBOM + 1;
        in.mark(byteLimit > Integer.MAX_VALUE
                ? Integer.MAX_VALUE : (int) byteLimit);
    }

    /**
     * Reset the stream to the position recorded by the most recent
     * mark(), restoring the read-ahead byte and the byte order that
     * were current at that time.
     *
     * @exception IOException If the underlying input stream does not
     * support marking, if mark() has not been invoked since the reader
     * was opened, or if the underlying stream fails to reset.
     */
    public void reset() throws IOException {
        if (!in.markSupported()) {
            throw new IOException("reset() not supported");
        }
        if (!markIsSet) {
            // Without a mark there is no saved decoder state; restoring
            // the default field values would inject a bogus 0x00 byte
            // as the first byte of the next character.
            throw new IOException("reset() without mark()");
        }
        byteOrder = markByteOrder;
        firstByte = markFirstByte;
        in.reset();
    }

    /**
     * Get the size in chars of an array of bytes.
     *
     * A leading Byte Order Mark at the given offset is not counted,
     * because it is not part of the data.
     *
     * @param array Source buffer
     * @param offset Offset at which to start counting characters
     * @param length number of bytes to use for counting
     *
     * @return number of characters that would be converted
     */
    /*
     * This method is only used by our internal Helper class in the method
     * byteToCharArray to know how much to allocate before using a
     * reader. If we encounter bad encoding we should return a count
     * that includes that character so the reader will throw an IOException
     */
    public int sizeOf(byte[] array, int offset, int length) {
        int chars = length / BYTES_PER_CHAR;
        // look for a BOM only when a complete byte pair is available,
        // and look for it where the data starts, not at index 0
        if (length >= BYTES_PER_CHAR) {
            int b1 = 0xff & array[offset];
            int b2 = 0xff & array[offset + 1];
            if ((b1 == 0xfe && b2 == 0xff)
                    || (b1 == 0xff && b2 == 0xfe)) {
                // do not count BOM, it's not a part of data
                return chars - 1;
            }
        }
        return chars;
    }
}