// UTF8Reader.java example
//
// Explorer
// i2p.i2p-master
package net.i2p.sam;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;


/**
 *  An unbuffered version of InputStreamReader.
 *
 *  Does not read any extra bytes from the underlying stream, as long as
 *  input is well-formed. This permits the partial reading of an InputStream
 *  as UTF-8 and then passing the remainder of the input stream elsewhere.
 *
 *  Code points above U+FFFF (4-byte sequences) are returned as a surrogate
 *  pair over two consecutive calls to read(); all four bytes are consumed
 *  by the first call, and the low surrogate is held in this reader until
 *  the second call.
 *
 *  This isn't the most robust for malformed input, so it
 *  may not be appropriate for e.g. HTTP headers.
 *  When a multi-byte sequence is cut short by a byte that is not a
 *  continuation byte, a replacement character is returned and the
 *  offending byte is held in this reader and decoded by the next read(),
 *  so at most one byte is taken from the stream ahead of time, and only
 *  on malformed input.
 *
 *  Not thread-safe, obviously.
 *
 *  May be moved to net.i2p.util if anybody else needs it.
 *
 *  @since 0.9.24 somewhat adapted from net.i2p.util.TranslateReader
 */
public class UTF8Reader extends Reader {

    private final InputStream _in;
    // following three are lazily initialized when needed
    private ByteBuffer _bb;
    private CharBuffer _cb;
    private CharsetDecoder _dc;
    // low surrogate of an already-decoded pair, to be returned by the next read(); -1 if none
    private int _pendingChar = -1;
    // byte taken from _in that terminated a truncated sequence, not yet decoded; -1 if none
    private int _pendingByte = -1;

    // Charset.forName("UTF-8").newDecoder().replacement().charAt(0) & 0xffff
    private static final int REPLACEMENT = 0xfffd;

    /**
     *  @param in UTF-8
     */
    public UTF8Reader(InputStream in) {
        super();
        _in = in;
    }

    /**
     *  Reads one UTF-16 code unit.
     *
     *  ASCII bytes are returned directly. For a multi-byte lead byte, the
     *  number of continuation bytes implied by the lead byte is read and the
     *  sequence is handed to a CharsetDecoder. A 4-byte sequence produces a
     *  surrogate pair; the high surrogate is returned here and the low
     *  surrogate by the following call.
     *
     *  @return the character, -1 on EOF, or the replacement character
     *          (U+FFFD) on decoding error
     *  @throws IOException if the underlying stream throws
     */
    @Override
    public int read() throws IOException {
        if (_pendingChar >= 0) {
            // second half of a surrogate pair decoded by the previous call
            int low = _pendingChar;
            _pendingChar = -1;
            return low;
        }
        int b = nextByte();
        if (b < 0)
            return b;
        // https://en.wikipedia.org/wiki/Utf-8
        if ((b & 0x80) == 0)
            return b;
        int end = continuationCount(b);
        if (end < 0)  //  error, 10xxxxxx (or 0xfe / 0xff)
            return REPLACEMENT;
        if (_bb == null) {
            // 6 bytes: the legacy 5- and 6-byte forms are still consumed as a unit
            _bb = ByteBuffer.allocate(6);
            // 2 chars: a 4-byte sequence decodes to a surrogate pair, and the
            // decoder writes nothing at all if both halves don't fit
            _cb = CharBuffer.allocate(2);
            _dc = Charset.forName("UTF-8").newDecoder();
        } else {
            _bb.clear();
            _cb.clear();
        }
        _bb.put((byte) b);
        for (int i = 0; i < end; i++) {
            b = _in.read();
            if (b < 0)
                return REPLACEMENT;  // next read will return EOF
            // we aren't going to check for all errors,
            // but let's fail fast on this one
            if ((b & 0xc0) != 0x80) {
                // Not a continuation byte, so it starts the next character.
                // Keep it for the next read() instead of dropping it.
                _pendingByte = b;
                return REPLACEMENT;
            }
            _bb.put((byte) b);
        }
        _dc.reset();
        _bb.flip();
        CoderResult result = _dc.decode(_bb, _cb, true);
        // Overflow and underflow are not errors.
        // So just check if we got a character back in the buffer.
        _cb.flip();
        if (result.isError() || !_cb.hasRemaining())
            return REPLACEMENT;
        int rv = _cb.get() & 0xffff;
        if (_cb.hasRemaining())
            _pendingChar = _cb.get() & 0xffff;
        return rv;
    }

    /**
     *  @return the byte held back by a previous read() if there is one,
     *          otherwise the next byte from the underlying stream; -1 on EOF
     */
    private int nextByte() throws IOException {
        if (_pendingByte >= 0) {
            int held = _pendingByte;
            _pendingByte = -1;
            return held;
        }
        return _in.read();
    }

    /**
     *  @param lead a byte with the high bit set, 0x80 - 0xff
     *  @return how many more bytes the lead byte calls for, 1 - 5,
     *          or -1 if it is not a lead byte
     */
    private static int continuationCount(int lead) {
        if ((lead & 0xe0) == 0xc0)
            return 1;
        if ((lead & 0xf0) == 0xe0)
            return 2;
        if ((lead & 0xf8) == 0xf0)
            return 3;
        if ((lead & 0xfc) == 0xf8)
            return 4;
        if ((lead & 0xfe) == 0xfc)
            return 5;
        return -1;
    }

    /**
     *  Fills the whole array unless EOF is reached first.
     *
     *  @return number of chars read, or -1 on EOF before any char was read
     */
    @Override
    public int read(char cbuf[]) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

    /**
     *  Reads one char at a time via read() until len chars are stored
     *  or EOF is reached; does not return early on a short read.
     *
     *  @param cbuf destination
     *  @param off index in cbuf of the first char to store
     *  @param len maximum number of chars to read
     *  @return number of chars read (0 if len is 0),
     *          or -1 on EOF before any char was read
     *  @throws IndexOutOfBoundsException if off or len is negative,
     *          or off + len is greater than cbuf.length
     */
    @Override
    public int read(char cbuf[], int off, int len) throws IOException {
        // validate up front so nothing is consumed from the stream on bad arguments
        if (off < 0 || len < 0 || len > cbuf.length - off)
            throw new IndexOutOfBoundsException("off=" + off + " len=" + len +
                                                " length=" + cbuf.length);
        for (int i = 0; i < len; i++) {
            int c = read();
            if (c < 0) {
                if (i == 0)
                    return -1;
                return i;
            }
            cbuf[off + i] = (char) c;
        }
        return len;
    }

    /**
     *  Closes the underlying stream.
     */
    @Override
    public void close() throws IOException {
        _in.close();
    }

/****
    public static void main(String[] args) {
        try {
            String s = "Consider the encoding of the Euro sign, €." +
                       " The Unicode code point for \"€\" is U+20AC.";
            byte[] test = s.getBytes("UTF-8");
            InputStream bais = new java.io.ByteArrayInputStream(test);
            UTF8Reader r = new UTF8Reader(bais);
            int b;
            StringBuilder buf = new StringBuilder(128);
            while ((b = r.read()) >= 0) {
                buf.append((char) b);
            }
            System.out.println("Received: " + buf);
            System.out.println("Test passed? " + buf.toString().equals(s));
            buf.setLength(0);
            bais = new java.io.ByteArrayInputStream(new byte[] { 'x', (byte) 0xcc, 'x' } );
            r = new UTF8Reader(bais);
            while ((b = r.read()) >= 0) {
                buf.append((char) b);
            }
            System.out.println("Received: " + buf);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
****/
}