/* * Copyright 1990-2009 Sun Microsystems, Inc. All Rights Reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 only, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License version 2 for more details (a copy is * included at /legal/license.txt). * * You should have received a copy of the GNU General Public License * version 2 along with this work; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA * * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa * Clara, CA 95054 or visit www.sun.com if you need additional * information or have any questions. */ package com.sun.ukit.io; import java.io.ByteArrayInputStream; import java.io.Reader; import java.io.InputStream; import java.io.IOException; import java.io.EOFException; import java.io.UTFDataFormatException; import java.io.UnsupportedEncodingException; import java.util.Vector; /** * UTF-8 transformed UCS-2 character stream reader. * * This reader converts UTF-8 transformed UCS-2 characters to Java characters. * The UCS-2 subset of UTF-8 transformation is described in RFC-2279 #2 * "UTF-8 definition": * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * This reader will return incorrect last character on broken UTF-8 stream. */ public class ReaderUTF8 extends Reader { private static final int maxBytesInUTF8Character = 3; private InputStream is; final private byte[] buff = new byte[128]; private int bidx = 0; private int bcnt = 0; /** * Constructor. * * @param is A byte input stream. */ public ReaderUTF8(InputStream is) { this.is = is; } /** * Fills buffer with data from the source InputStream * @throws IOException */ private void fillBuffer() throws IOException { bcnt -= bidx; if( bcnt > 0 ) System.arraycopy(buff, bidx, buff, 0, bcnt); bidx = 0; while( buff.length - bcnt > 0 ){ int count = is.read(buff, bcnt, buff.length - bcnt); if( count < 0 ) break; bcnt += count; } //System.out.print("fillBuffer[" + bcnt + "]: "); //for( int o = bidx; o < bcnt; o++) System.out.print( (char)buff[o] ); //System.out.println(); } /** * Reads characters into a portion of an array. This method uses internal * buffer. * * @param cbuf Destination buffer. * @param off Offset at which to start storing characters. * @param len Maximum number of characters to read. * @exception IOException If any IO errors occur. * @exception UnsupportedEncodingException If UCS-4 character occur in the * stream. */ public int read(char[] cbuf, int off, int len) throws IOException { int num = 0; while (num < len) { if (bcnt - bidx < maxBytesInUTF8Character){ //System.out.println( "bcnt = " + bcnt + ", bidx = " + bidx + ", num = " + num ); if( bidx > bcnt ) break; fillBuffer(); if( bcnt - bidx == 0 ){ if( num == 0 ) return -1; break; } } char val = (char)buff[bidx++]; if (val <= 0x7f) { cbuf[off++] = val; } else { switch (val & 0xf0) { case 0xc0: case 0xd0: cbuf[off++] = (char)(((val & 0x1f) << 6) | (buff[bidx++] & 0x3f)); break; case 0xe0: cbuf[off++] = (char)(((val & 0x0f) << 12) | ((buff[bidx++] & 0x3f) << 6) | (buff[bidx++] & 0x3f)); break; case 0xf0: // UCS-4 character throw new UnsupportedEncodingException(); default: throw new UTFDataFormatException(); } } num++; } if( bidx > bcnt ){ // last input UTF-8 character is wrong bidx = bcnt = 0; throw new EOFException(); } //System.out.print( "ReaderUTF8.read[" + num + "]:" ); //for( int o = off - num; o < off; o++) System.out.print( cbuf[o] ); //System.out.println(); return num; } /** * Reads a single character. This method does not use internal buffer. * * @return The character read, as an integer in the range 0 to 65535 * (0x00-0xffff), or -1 if the end of the stream has been reached. * @exception IOException If any IO errors occur. * @exception UnsupportedEncodingException If UCS-4 character occur in the * stream. */ public int read() throws IOException { if (bcnt - bidx < maxBytesInUTF8Character){ fillBuffer(); if( bcnt - bidx == 0 ) return -1; } int val = buff[bidx++]; if (val > 0x7f) { switch (val & 0xf0) { case 0xc0: case 0xd0: val = ((val & 0x1f) << 6) | (buff[bidx++] & 0x3f); break; case 0xe0: val = ((val & 0x0f) << 12) | ((buff[bidx++] & 0x3f) << 6) | (buff[bidx++] & 0x3f); break; case 0xf0: // UCS-4 character throw new UnsupportedEncodingException(); default: throw new UTFDataFormatException(); } } if( bidx > bcnt ){ // last input UTF-8 character is wrong bidx = bcnt = 0; throw new EOFException(); } //System.out.println( "ReaderUTF8.read: " + (char)val ); return val; } /** * Closes the stream. * * @exception IOException If any IO errors occur. */ public void close() throws IOException { is.close(); } public InputStream getByteStream() { InputStream result = is; if( bidx < bcnt ){ MultiInputStream r = new MultiInputStream(); r.add( new ByteArrayInputStream( buff, bidx, bcnt - bidx ) ); r.add(is); result = r; } is = null; return result; } static class MultiInputStream extends InputStream { protected Vector list = new Vector(); public void add( InputStream is ){ list.addElement(is); } public int read(byte[] b, int off, int len) throws IOException { int count = -1; while( list.size() != 0 && (count = ((InputStream)list.firstElement()).read(b, off, len)) < 0 ) list.removeElementAt(0); return count; } final private byte[] buff = new byte[1]; public int read() throws IOException { if( read( buff, 0, 1 ) < 0 ) return -1; return buff[0] & 0xFF; } } }