/*
 * Javolution - Java(TM) Solution for Real-Time and Embedded Systems
 * Copyright (C) 2012 - Javolution (http://javolution.org/)
 * All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this software is
 * freely granted, provided that this notice is preserved.
 */
package javolution.io;

import java.io.CharConversionException;
import java.io.IOException;
import java.io.Reader;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;

/**
 * <p> A UTF-8 <code>java.nio.ByteBuffer</code> reader.</p>
 *
 * <p> This reader can be used for efficient decoding of native byte
 *     buffers (e.g. <code>MappedByteBuffer</code>), high-performance
 *     messaging (no intermediate buffer), etc.</p>
 *
 * <p> This reader supports surrogate <code>char</code> pairs (representing
 *     characters in the range [U+10000 .. U+10FFFF]). It can also be used
 *     to read Unicode code values (up to 31 bits) directly
 *     (ref. {@link #read()}).</p>
 *
 * <p> Each invocation of one of the <code>read()</code> methods may cause one
 *     or more bytes to be read from the underlying byte buffer.
 *     The end of stream is reached when the byte buffer position and limit
 *     coincide.</p>
 *
 * @author  <a href="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
 * @version 2.0, December 9, 2004
 * @see     UTF8ByteBufferWriter
 */
public final class UTF8ByteBufferReader extends Reader {

    /**
     * Holds the byte buffer source (<code>null</code> when this reader is
     * closed or has been reset).
     */
    private ByteBuffer _byteBuffer;

    /**
     * Holds the bits accumulated so far for the multi-byte sequence being
     * decoded.
     */
    private int _code;

    /**
     * Holds the number of continuation bytes still expected for the
     * multi-byte sequence being decoded (zero when no sequence is pending).
     */
    private int _moreBytes;

    /**
     * Default constructor.
     */
    public UTF8ByteBufferReader() {}

    /**
     * Sets the <code>ByteBuffer</code> to use for reading available bytes
     * from current buffer position.
     *
     * @param  byteBuffer the <code>ByteBuffer</code> source.
     * @return this UTF-8 reader.
     * @throws IllegalStateException if this reader is being reused and
     *         it has not been {@link #close closed} or {@link #reset reset}.
     */
    public UTF8ByteBufferReader setInput(ByteBuffer byteBuffer) {
        if (_byteBuffer != null)
            throw new IllegalStateException("Reader not closed or reset");
        _byteBuffer = byteBuffer;
        return this;
    }

    /**
     * Indicates if this stream is ready to be read.
     *
     * @return <code>true</code> if the byte buffer has remaining bytes to
     *         read; <code>false</code> otherwise.
     * @throws IOException if this reader is closed.
     */
    @Override
    public boolean ready() throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        return _byteBuffer.hasRemaining();
    }

    /**
     * Closes and {@link #reset resets} this reader for reuse.
     * Closing a reader which has no input set has no effect.
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void close() throws IOException {
        if (_byteBuffer != null) {
            reset();
        }
    }

    /**
     * Reads a single character. This method does not block, <code>-1</code>
     * is returned if the buffer's limit has been reached.
     *
     * @return the 31-bits Unicode of the character read, or -1 if there is
     *         no more remaining bytes to be read.
     * @throws IOException if an I/O error occurs (e.g. incomplete
     *         character sequence being read).
     */
    @Override
    public int read() throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        if (!_byteBuffer.hasRemaining())
            return -1;
        byte b = _byteBuffer.get();
        return (b >= 0) ? b : read2(b); // Single byte (ASCII) is the fast path.
    }

    /**
     * Decodes one full character starting with the specified byte, reading
     * any further byte required from the byte buffer.
     *
     * <p> When the buffer's limit is reached in the middle of a sequence the
     *     bits decoded so far are kept, so that decoding can be resumed with
     *     the next continuation byte. When a malformed sequence is detected
     *     the decoding state is cleared, so that the bytes following the
     *     malformed sequence are decoded independently of it.</p>
     *
     * @param  b the first byte to decode (a lead byte, or a continuation
     *         byte when a sequence is pending).
     * @return the Unicode value decoded.
     * @throws CharConversionException if the encoding is invalid or if the
     *         buffer's limit is reached before the end of the sequence.
     * @throws IOException declared for the callers convenience.
     */
    private int read2(byte b) throws IOException {
        try {
            while (true) {
                if (_moreBytes == 0) { // Start of a character.
                    if (b >= 0) { // 0xxxxxxx
                        return b;
                    } else if ((b & 0xe0) == 0xc0) { // 110xxxxx
                        _code = b & 0x1f;
                        _moreBytes = 1;
                    } else if ((b & 0xf0) == 0xe0) { // 1110xxxx
                        _code = b & 0x0f;
                        _moreBytes = 2;
                    } else if ((b & 0xf8) == 0xf0) { // 11110xxx
                        _code = b & 0x07;
                        _moreBytes = 3;
                    } else if ((b & 0xfc) == 0xf8) { // 111110xx
                        _code = b & 0x03;
                        _moreBytes = 4;
                    } else if ((b & 0xfe) == 0xfc) { // 1111110x
                        _code = b & 0x01;
                        _moreBytes = 5;
                    } else { // Stray continuation byte, 0xFE or 0xFF.
                        throw invalidEncoding();
                    }
                } else if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte)
                    _code = (_code << 6) | (b & 0x3f); // Adds 6 bits to code.
                    if (--_moreBytes == 0)
                        return _code;
                } else { // Sequence interrupted before its last byte.
                    throw invalidEncoding();
                }
                b = _byteBuffer.get();
            }
        } catch (BufferUnderflowException e) {
            // State is kept on purpose (see method documentation).
            CharConversionException incomplete = new CharConversionException(
                    "Incomplete Sequence");
            incomplete.initCause(e);
            throw incomplete;
        }
    }

    /**
     * Clears the decoding state and returns the exception reporting a
     * malformed sequence.
     *
     * @return the exception to be thrown by the caller.
     */
    private CharConversionException invalidEncoding() {
        _code = 0;
        _moreBytes = 0;
        return new CharConversionException("Invalid UTF-8 Encoding");
    }

    /**
     * Reads characters into a portion of an array. This method does not
     * block.
     *
     * <p> Note: Characters between U+10000 and U+10FFFF are represented
     *     by surrogate pairs (two <code>char</code>). Such a character is
     *     never split: if only one position is left in the destination, the
     *     character is left in the byte buffer and the method returns the
     *     number of characters stored so far (possibly zero).</p>
     *
     * @param  cbuf the destination buffer.
     * @param  off the offset at which to start storing characters.
     * @param  len the maximum number of characters to read
     * @return the number of characters read, or -1 if there is no more
     *         byte remaining.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(char cbuf[], int off, int len) throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        final int off_plus_len = off + len;
        int remaining = _byteBuffer.remaining();
        if (remaining <= 0)
            return -1;
        for (int i = off; i < off_plus_len;) {
            if (remaining-- <= 0)
                return i - off; // Byte buffer exhausted.
            byte b = _byteBuffer.get();
            if (b >= 0) {
                cbuf[i++] = (char) b; // Most common case.
                continue;
            }
            // A fresh two or three bytes sequence holds at most 16 bits,
            // hence always fits into a single 'char'. Anything else (longer
            // sequence, resumed sequence) may require a surrogate pair.
            boolean singleChar = (_moreBytes == 0)
                    && (((b & 0xe0) == 0xc0) || ((b & 0xf0) == 0xe0));
            if (!singleChar && (i >= off_plus_len - 1)) {
                // Not enough space in destination (go back).
                _byteBuffer.position(_byteBuffer.position() - 1);
                return i - off;
            }
            int code = read2(b);
            remaining = _byteBuffer.remaining(); // Recalculates.
            if (code < 0x10000) {
                cbuf[i++] = (char) code;
            } else if (code <= 0x10ffff) { // Surrogates.
                cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);
                cbuf[i++] = (char) (((code - 0x10000) & 0x3ff) + 0xdc00);
            } else {
                throw new CharConversionException("Cannot convert U+"
                        + Integer.toHexString(code)
                        + " to char (code greater than U+10FFFF)");
            }
        }
        return len;
    }

    /**
     * Reads characters into the specified appendable. This method does not
     * block; it returns when the buffer's limit has been reached.
     *
     * <p> Note: Characters between U+10000 and U+10FFFF are represented
     *     by surrogate pairs (two <code>char</code>).</p>
     *
     * @param  dest the destination buffer.
     * @throws IOException if an I/O error occurs.
     */
    public void read(Appendable dest) throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        while (_byteBuffer.hasRemaining()) {
            byte b = _byteBuffer.get();
            if (b >= 0) {
                dest.append((char) b); // Most common case.
            } else {
                int code = read2(b);
                if (code < 0x10000) {
                    dest.append((char) code);
                } else if (code <= 0x10ffff) { // Surrogates.
                    dest.append((char) (((code - 0x10000) >> 10) + 0xd800));
                    dest.append((char) (((code - 0x10000) & 0x3ff) + 0xdc00));
                } else {
                    throw new CharConversionException("Cannot convert U+"
                            + Integer.toHexString(code)
                            + " to char (code greater than U+10FFFF)");
                }
            }
        }
    }

    /**
     * Resets this reader for reuse: the byte buffer source is released and
     * any partially decoded sequence is discarded. A new source has to be
     * {@link #setInput set} before reading again.
     */
    @Override
    public void reset() {
        _byteBuffer = null;
        _code = 0;
        _moreBytes = 0;
    }

    /**
     * @deprecated Replaced by {@link #setInput(ByteBuffer)}
     */
    @Deprecated
    public UTF8ByteBufferReader setByteBuffer(ByteBuffer byteBuffer) {
        return this.setInput(byteBuffer);
    }
}