package net.i2p.sam; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; /** * An unbuffered version of InputStreamReader. * * Does not read any extra characters, as long as input is well-formed. * This permits the partial reading of an InputStream as UTF-8 * and then passing the remainder of the input stream elsewhere. * This isn't the most robust for malformed input, so it * may not be appropriate for e.g. HTTP headers. * * Not thread-safe, obviously. * * May be moved to net.i2p.util if anybody else needs it. * * @since 0.9.24 somewhat adapted from net.i2p.util.TranslateReader */ public class UTF8Reader extends Reader { private final InputStream _in; // following three are lazily initialized when needed private ByteBuffer _bb; private CharBuffer _cb; private CharsetDecoder _dc; // Charset.forName("UTF-8").newDecoder().replacement().charAt(0) & 0xffff private static final int REPLACEMENT = 0xfffd; /** * @param in UTF-8 */ public UTF8Reader(InputStream in) { super(); _in = in; } /** * @return replacement character on decoding error */ @Override public int read() throws IOException { int b = _in.read(); if (b < 0) return b; // https://en.wikipedia.org/wiki/Utf-8 if ((b & 0x80) == 0) return b; if (_bb == null) { _bb = ByteBuffer.allocate(6); _cb = CharBuffer.allocate(1); _dc = Charset.forName("UTF-8").newDecoder(); } else { _bb.clear(); _cb.clear(); } _bb.put((byte) b); int end; // how many more if ((b & 0xe0) == 0xc0) end = 1; else if ((b & 0xf0) == 0xe0) end = 2; else if ((b & 0xf8) == 0xf0) end = 3; else if ((b & 0xfc) == 0xf8) end = 4; else if ((b & 0xfe) == 0xfc) end = 5; else // error, 10xxxxxx return REPLACEMENT; for (int i = 0; i < end; i++) { b = _in.read(); if (b < 0) return REPLACEMENT; // next read will return EOF // we aren't going to check for all errors, // but let's fail fast on this one if ((b & 0x80) == 0) return REPLACEMENT; _bb.put((byte) b); } _dc.reset(); _bb.flip(); CoderResult result = _dc.decode(_bb, _cb, true); // Overflow and underflow are not errors. // It seems to return underflow every time. // So just check if we got a character back in the buffer. _cb.flip(); if (result.isError() || !_cb.hasRemaining()) return REPLACEMENT; // let underflow and overflow go, return first return _cb.get() & 0xffff; } @Override public int read(char cbuf[]) throws IOException { return read(cbuf, 0, cbuf.length); } public int read(char cbuf[], int off, int len) throws IOException { for (int i = 0; i < len; i++) { int c = read(); if (c < 0) { if (i == 0) return -1; return i; } cbuf[off + i] = (char) c; } return len; } public void close() throws IOException { _in.close(); } /**** public static void main(String[] args) { try { String s = "Consider the encoding of the Euro sign, €." + " The Unicode code point for \"€\" is U+20AC."; byte[] test = s.getBytes("UTF-8"); InputStream bais = new java.io.ByteArrayInputStream(test); UTF8Reader r = new UTF8Reader(bais); int b; StringBuilder buf = new StringBuilder(128); while ((b = r.read()) >= 0) { buf.append((char) b); } System.out.println("Received: " + buf); System.out.println("Test passed? " + buf.toString().equals(s)); buf.setLength(0); bais = new java.io.ByteArrayInputStream(new byte[] { 'x', (byte) 0xcc, 'x' } ); r = new UTF8Reader(bais); while ((b = r.read()) >= 0) { buf.append((char) b); } System.out.println("Received: " + buf); } catch (IOException ioe) { ioe.printStackTrace(); } } ****/ }