/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
//package org.apache.xerces.impl.io;
package org.geoserver.ows.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
 * Reader for UCS-2 and UCS-4 encodings
 * (more precisely ISO-10646-UCS-(2|4) encodings).
 *
 * <p>
 * This variant of the Xerces <code>UCSReader</code> handles supplementary
 * Unicode code points: a UCS-4 code point in the range
 * <code>0x10000 - 0x10FFFF</code> is converted to a UTF-16 surrogate pair
 * as described in RFC 2781. Because one input code point may produce two
 * output chars, the reader keeps a small internal stack of chars that have
 * been decoded but not yet handed to the caller.
 * </p>
 *
 * <p>
 * UCS-4 values that are not valid code points (negative or greater than
 * <code>0x10FFFF</code>) are not rejected; they are truncated to their low
 * 16 bits. Instances are not synchronized.
 * </p>
 *
 * @author Neil Graham, IBM
 *
 * @version $Id$
 */
public class UCSReader extends Reader {
    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader
     * since it's reasonable to surmise that the average UCS-4-encoded
     * file should be 4 times as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /**
     * Starting size of the internal char buffer. The internal char buffer
     * holds excess chars that may be left from a previous read operation
     * when working with UCS-4 data (it is never used for UCS-2).
     */
    public static final int CHAR_BUFFER_INITIAL_SIZE = 1024;

    /** Encoding constant: UCS-2, little endian. */
    public static final short UCS2LE = 1;

    /** Encoding constant: UCS-2, big endian. */
    public static final short UCS2BE = 2;

    /** Encoding constant: UCS-4, little endian. */
    public static final short UCS4LE = 4;

    /** Encoding constant: UCS-4, big endian. */
    public static final short UCS4BE = 8;

    /** The minimum value of a supplementary code point. */
    public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;

    /** The minimum value of a Unicode code point. */
    public static final int MIN_CODE_POINT = 0x000000;

    /** The maximum value of a Unicode code point. */
    public static final int MAX_CODE_POINT = 0x10ffff;

    //
    // Data
    //

    /** Input stream; <code>null</code> once the reader has been closed. */
    protected InputStream fInputStream;

    /** Byte buffer used by the bulk read methods. */
    protected byte[] fBuffer;

    /** What kind of data we're dealing with (one of the UCS* constants). */
    protected short fEncoding;

    /**
     * Chars that were decoded from the input but could not be returned to
     * the caller yet (for example the low surrogate of a pair whose high
     * surrogate was returned by <code>read()</code>, or code points decoded
     * by <code>read(char[], int, int)</code> that did not fit into the
     * destination). The array is used as a stack: the char that must be
     * returned next is at index <code>fCharCount - 1</code>.
     */
    protected char[] fCharBuf;

    /** Number of chars currently stored in <code>fCharBuf</code>. */
    protected int fCharCount;

    /**
     * Copy of the pending chars taken by <code>mark</code>, restored by
     * <code>reset</code>; <code>null</code> if <code>mark</code> has not
     * been called.
     */
    private char[] fMarkedChars;

    //
    // Constructors
    //

    /**
     * Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the
     * specified input stream using the default buffer size. The endianness
     * and exact input encoding (<code>UCS-2</code> or <code>UCS-4</code>)
     * must be known in advance.
     *
     * @param inputStream input stream with UCS-2|4 encoded data
     * @param encoding    One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    }

    /**
     * Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the source
     * input stream using an explicitly specified buffer size.
     *
     * @param inputStream input stream with UCS-2|4 encoded data
     * @param size        The requested byte buffer size. It is rounded down
     *                    to a multiple of 4 (with a minimum of 4) so that
     *                    the buffer always holds whole code units.
     * @param encoding    One of UCS2LE, UCS2BE, UCS4LE or UCS4BE
     *
     * @exception IllegalArgumentException if <code>size</code> is negative
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        if (size < 0) {
            throw new IllegalArgumentException("Buffer size must not be negative: " + size);
        }
        fInputStream = inputStream;
        // Whole number of 4-byte units; rounding down cannot overflow.
        fBuffer = new byte[Math.max(4, size & ~3)];
        fEncoding = encoding;
        fCharBuf = new char[CHAR_BUFFER_INITIAL_SIZE];
        fCharCount = 0;
    }

    //
    // Reader methods
    //

    /**
     * Reads a single UTF-16 code unit.
     *
     * <p>
     * If a supplementary code point is encountered in <code>UCS-4</code>
     * input, its high surrogate is returned immediately and the low
     * surrogate is kept in the internal buffer to be returned by the next
     * <code>read()</code> or <code>read(char[], int, int)</code> call.
     * If the stream ends in the middle of a code unit, the incomplete unit
     * is dropped and <code>-1</code> is returned.
     * </p>
     *
     * @return the code unit in the range <code>0 - 0xFFFF</code>, or
     *         <code>-1</code> if the end of the stream has been reached
     *
     * @exception IOException when an I/O error occurs or the reader is closed
     */
    public int read() throws IOException {
        ensureOpen();
        // If we got something in the char buffer, let's use it.
        if (fCharCount > 0) {
            return fCharBuf[--fCharCount];
        }
        // End of stream must be tested on the raw return value: 0xFF is a
        // perfectly legal data byte.
        int b0 = fInputStream.read();
        if (b0 == -1) {
            return -1;
        }
        int b1 = fInputStream.read();
        if (b1 == -1) {
            return -1;
        }
        if (fEncoding < 4) { // UCS-2
            return isBigEndian() ? ((b0 << 8) | b1) : ((b1 << 8) | b0);
        }
        int b2 = fInputStream.read();
        if (b2 == -1) {
            return -1;
        }
        int b3 = fInputStream.read();
        if (b3 == -1) {
            return -1;
        }
        int codePoint = decodeUCS4(b0, b1, b2, b3);
        if (!isSupplementaryCodePoint(codePoint)) {
            // Same truncation the bulk read applies by casting to char; keeps
            // the result inside the range a Reader may return.
            return codePoint & 0xFFFF;
        }
        // The low surrogate is returned by the next read call.
        pushChar(lowSurrogate(codePoint));
        return highSurrogate(codePoint);
    }

    /**
     * Reads characters into a portion of an array.
     *
     * <p>
     * For UCS-2 input the work is delegated to <code>readUCS2</code>. For
     * UCS-4 input, chars pending in the internal buffer are delivered first;
     * if there is still room, as many bytes as would be needed if every code
     * point were from the Basic Multilingual Plane are requested from the
     * stream and decoded. Code points that do not fit into the destination
     * (because supplementary code points take two chars each) are kept in
     * the internal buffer for subsequent reads. A surrogate pair may be
     * split across two calls when only one slot is left.
     * </p>
     *
     * @param ch     Destination buffer
     * @param offset Offset at which to start storing characters
     * @param length Maximum number of characters to read
     *
     * @return The number of UTF-16 code units stored, or <code>-1</code> if
     *         the end of the stream has been reached. This equals the number
     *         of <code>UCS-4</code> characters read only if none of them was
     *         a supplementary code point.
     *
     * @exception IOException               If an I/O error occurs or the
     *                                      reader is closed
     * @exception IndexOutOfBoundsException If <code>offset</code> and
     *                                      <code>length</code> do not denote
     *                                      a valid range of <code>ch</code>
     */
    public int read(char[] ch, int offset, int length) throws IOException {
        ensureOpen();
        // Written so that offset + length cannot overflow.
        if ((offset < 0) || (length < 0) || (length > ch.length - offset)) {
            throw new IndexOutOfBoundsException();
        }
        if (length == 0) {
            return 0;
        }
        if (fEncoding < 4) {
            return readUCS2(ch, offset, length);
        }

        // First use chars from the internal char buffer (if any).
        int charsRead = 0;
        while ((charsRead < length) && (fCharCount > 0)) {
            ch[offset + charsRead++] = fCharBuf[--fCharCount];
        }
        if (charsRead == length) {
            return charsRead;
        }

        // From here on the internal char buffer is empty.
        int wanted = length - charsRead;
        int byteLength = (wanted > (fBuffer.length >> 2)) ? (fBuffer.length & ~3) : (wanted << 2);
        int count = fInputStream.read(fBuffer, 0, byteLength);
        if (count == -1) {
            return (charsRead == 0) ? -1 : charsRead;
        }
        count = alignBuffer(count, 4);

        int numChars = count >> 2;
        int charsCount = charsRead;
        int pos = 0;
        int i = 0;
        boolean lowPending = false;
        char pendingLow = 0;
        for (; (i < numChars) && (charsCount < length); i++) {
            int codePoint = decodeUCS4(fBuffer[pos] & 0xff, fBuffer[pos + 1] & 0xff,
                    fBuffer[pos + 2] & 0xff, fBuffer[pos + 3] & 0xff);
            pos += 4;
            if (!isSupplementaryCodePoint(codePoint)) {
                ch[offset + charsCount++] = (char) codePoint;
            } else {
                ch[offset + charsCount++] = highSurrogate(codePoint);
                if (charsCount < length) {
                    ch[offset + charsCount++] = lowSurrogate(codePoint);
                } else {
                    // Only one slot was left: hand out the high surrogate now
                    // and keep the low one for the next read, so that the
                    // call never returns 0 for a non-empty request.
                    lowPending = true;
                    pendingLow = lowSurrogate(codePoint);
                }
            }
        }

        // Store code points that were read but not delivered. They are pushed
        // last-to-first so that popping yields them in stream order.
        int tail = numChars << 2;
        for (int k = numChars; k > i; k--) {
            tail -= 4;
            int codePoint = decodeUCS4(fBuffer[tail] & 0xff, fBuffer[tail + 1] & 0xff,
                    fBuffer[tail + 2] & 0xff, fBuffer[tail + 3] & 0xff);
            if (!isSupplementaryCodePoint(codePoint)) {
                pushChar((char) codePoint);
            } else {
                // Low surrogate first, so that the high one is popped first.
                pushChar(lowSurrogate(codePoint));
                pushChar(highSurrogate(codePoint));
            }
        }
        if (lowPending) {
            // Must be the very next char returned, hence pushed last.
            pushChar(pendingLow);
        }
        return charsCount;
    }

    /**
     * Reads <code>UCS-2</code> characters into a portion of an array.
     *
     * <p>
     * If the underlying stream delivers an odd number of bytes, one more
     * byte is read to complete the last character; if the stream ends
     * there, the missing byte is taken to be zero.
     * </p>
     *
     * @param ch     destination buffer
     * @param offset offset at which to start storing characters
     * @param length maximum number of characters to read
     *
     * @return The number of characters read, or <code>-1</code>
     *         if the end of the stream has been reached
     *
     * @exception IOException If an I/O error occurs or the reader is closed
     */
    protected int readUCS2(char[] ch, int offset, int length) throws IOException {
        ensureOpen();
        int byteLength = (length > (fBuffer.length >> 1)) ? (fBuffer.length & ~1) : (length << 1);
        int count = fInputStream.read(fBuffer, 0, byteLength);
        if (count == -1) {
            return -1;
        }
        count = alignBuffer(count, 2);

        int numChars = count >> 1;
        boolean bigEndian = isBigEndian();
        int pos = 0;
        for (int i = 0; i < numChars; i++) {
            int b0 = fBuffer[pos++] & 0xff;
            int b1 = fBuffer[pos++] & 0xff;
            ch[offset + i] = (char) (bigEndian ? ((b0 << 8) | b1) : ((b1 << 8) | b0));
        }
        return numChars;
    }

    /**
     * Skips characters.
     *
     * <p>
     * For UCS-2 input the bytes are skipped directly in the underlying
     * stream; a character that was only partially skipped is completed and
     * counted. For UCS-4 input the characters are read and discarded (the
     * inherited <code>Reader.skip</code> behavior), because the number of
     * UTF-16 code units cannot be derived from the number of bytes and
     * because chars pending in the internal buffer have to be skipped first.
     * </p>
     *
     * @param n The number of characters to skip; values less than or equal
     *          to zero skip nothing
     *
     * @return The number of characters actually skipped
     *
     * @exception IOException If an I/O error occurs or the reader is closed
     */
    public long skip(long n) throws IOException {
        ensureOpen();
        if (n <= 0) {
            return 0;
        }
        if (fEncoding >= 4) {
            return super.skip(n);
        }
        // Clamp so that the shift below cannot overflow.
        long maxChars = Long.MAX_VALUE >> 1;
        long bytesSkipped = fInputStream.skip(((n > maxChars) ? maxChars : n) << 1);
        if (bytesSkipped <= 0) {
            return 0;
        }
        long charsSkipped = bytesSkipped >>> 1;
        if ((bytesSkipped & 1) != 0) {
            // Drop the second half of the split character to stay aligned.
            fInputStream.read();
            charsSkipped++;
        }
        return charsSkipped;
    }

    /**
     * Tells whether this stream is ready to be read.
     *
     * @return Always <code>false</code>. Note that returning false does not
     *         guarantee that the next read will block.
     *
     * @exception IOException If the reader is closed
     */
    public boolean ready() throws IOException {
        ensureOpen();
        return false;
    }

    /**
     * Tells whether this stream supports the mark() operation, which is the
     * case when the underlying byte stream supports it.
     *
     * @return <code>true</code> if the reader is open and the underlying
     *         stream supports <code>mark</code>
     */
    public boolean markSupported() {
        return (fInputStream != null) && fInputStream.markSupported();
    }

    /**
     * Marks the present position in the stream. Subsequent calls to
     * <code>reset</code> will attempt to reposition the stream to this
     * point. The operation relies on the marking facilities of the
     * underlying byte stream; chars pending in the internal buffer are
     * saved alongside.
     *
     * @param readAheadLimit Limit on the number of characters that may be
     *                       read while still preserving the mark. A positive
     *                       limit is converted to a byte limit for the
     *                       underlying stream: 2 bytes per char for UCS-2,
     *                       8 bytes per char for UCS-4 (4 bytes per code
     *                       point plus headroom for code points decoded
     *                       ahead of the caller).
     *
     * @exception IOException If the reader is closed
     */
    public void mark(int readAheadLimit) throws IOException {
        ensureOpen();
        int byteLimit = readAheadLimit;
        if (readAheadLimit > 0) {
            long scaled = ((long) readAheadLimit) << ((fEncoding >= 4) ? 3 : 1);
            byteLimit = (scaled > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int) scaled;
        }
        fInputStream.mark(byteLimit);
        char[] snapshot = new char[fCharCount];
        System.arraycopy(fCharBuf, 0, snapshot, 0, fCharCount);
        fMarkedChars = snapshot;
    }

    /**
     * Resets the stream by resetting the underlying byte stream. If
     * <code>mark</code> was called, the chars that were pending at that
     * time are restored; otherwise any pending chars are discarded.
     *
     * @exception IOException If the reader is closed or the underlying
     *                        stream fails to reset
     */
    public void reset() throws IOException {
        ensureOpen();
        fInputStream.reset();
        if (fMarkedChars == null) {
            fCharCount = 0;
            return;
        }
        int saved = fMarkedChars.length;
        if (fCharBuf.length < saved) {
            fCharBuf = new char[saved];
        }
        System.arraycopy(fMarkedChars, 0, fCharBuf, 0, saved);
        fCharCount = saved;
    }

    /**
     * Closes the stream and releases the buffers. Once the stream has been
     * closed, further <code>read</code>, <code>skip</code>,
     * <code>ready</code>, <code>mark</code> or <code>reset</code>
     * invocations will throw an IOException. Closing a previously closed
     * stream has no effect.
     *
     * @exception IOException If closing the underlying stream fails
     */
    public void close() throws IOException {
        if (fInputStream == null) {
            return;
        }
        try {
            fInputStream.close();
        } finally {
            fInputStream = null;
            fCharBuf = null;
            fBuffer = null;
            fMarkedChars = null;
            fCharCount = 0;
        }
    }

    /**
     * Returns the encoding currently in use by this character stream.
     *
     * @return Either <code>ISO-10646-UCS-2</code> or
     *         <code>ISO-10646-UCS-4</code>. The name does not indicate the
     *         byte order; use <code>getByteOrder</code> for that.
     */
    public String getEncoding() {
        return (fEncoding < 4) ? "ISO-10646-UCS-2" : "ISO-10646-UCS-4";
    }

    /**
     * Returns the byte order ("endianness") used to decode this stream.
     *
     * @return <code>BIG_ENDIAN</code> if the encoding is UCS2BE or UCS4BE,
     *         <code>LITTLE_ENDIAN</code> otherwise (the same rule the read
     *         methods apply).
     */
    public String getByteOrder() {
        return isBigEndian() ? "BIG_ENDIAN" : "LITTLE_ENDIAN";
    }

    /**
     * Determines whether the specified character (Unicode code point)
     * is in the supplementary character range, i.e. whether
     * <code>0x10000 &lt;= codePoint &lt;= 0x10ffff</code>.
     *
     * Equivalent to the JDK 1.5 <code>java.lang.Character</code> method of
     * the same name; kept here to provide JDK 1.4 compatibility.
     *
     * @param codePoint the character (Unicode code point) to be tested
     * @return <code>true</code> if the specified character is in the Unicode
     *         supplementary character range; <code>false</code> otherwise.
     */
    protected boolean isSupplementaryCodePoint(int codePoint) {
        return (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) && (codePoint <= MAX_CODE_POINT);
    }

    //
    // Private helpers
    //

    /** Throws an IOException if this reader has been closed. */
    private void ensureOpen() throws IOException {
        if (fInputStream == null) {
            throw new IOException("Stream closed");
        }
    }

    /** Tells whether input bytes are interpreted most significant byte first. */
    private boolean isBigEndian() {
        return (fEncoding == UCS2BE) || (fEncoding == UCS4BE);
    }

    /** Combines four unsigned byte values into a UCS-4 value using the stream's byte order. */
    private int decodeUCS4(int b0, int b1, int b2, int b3) {
        if (fEncoding == UCS4BE) {
            return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
        }
        return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
    }

    /** Returns the UTF-16 high surrogate of a supplementary code point (RFC 2781). */
    private static char highSurrogate(int codePoint) {
        return (char) (0xD800 + ((codePoint - MIN_SUPPLEMENTARY_CODE_POINT) >>> 10));
    }

    /** Returns the UTF-16 low surrogate of a supplementary code point (RFC 2781). */
    private static char lowSurrogate(int codePoint) {
        return (char) (0xDC00 + ((codePoint - MIN_SUPPLEMENTARY_CODE_POINT) & 0x3FF));
    }

    /** Pushes a char onto the pending-char stack, growing the stack when it is full. */
    private void pushChar(char c) {
        if (fCharCount == fCharBuf.length) {
            char[] grown = new char[Math.max(2, fCharBuf.length << 1)];
            System.arraycopy(fCharBuf, 0, grown, 0, fCharCount);
            fCharBuf = grown;
        }
        fCharBuf[fCharCount++] = c;
    }

    /**
     * Completes a partially read code unit: reads single bytes into
     * <code>fBuffer</code> until <code>count</code> is a multiple of
     * <code>width</code> (2 or 4), substituting zeros once the stream has
     * ended. Returns the new byte count.
     */
    private int alignBuffer(int count, int width) throws IOException {
        int missing = (width - (count & (width - 1))) & (width - 1);
        boolean endReached = false;
        for (int i = 0; i < missing; i++) {
            int next = endReached ? -1 : fInputStream.read();
            if (next == -1) {
                // End of input in the middle of a code unit; pad with zeros.
                endReached = true;
                fBuffer[count + i] = 0;
            } else {
                fBuffer[count + i] = (byte) next;
            }
        }
        return count + missing;
    }
} // class UCSReader