UnicodeInputStream.java example

Explorer
bboss-master
package bboss.org.apache.velocity.io;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

import bboss.org.apache.velocity.util.ExceptionUtils;


/**
 * This is an input stream that is unicode BOM aware. This allows you to e.g. read
 * Windows Notepad Unicode files as Velocity templates.
 *
 * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
 * the input stream reader.
 *
 * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
 * the caller must provide synchronization.
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer for BOM reading */
    private byte [] buf = new byte[MAX_BOM_SIZE];

    /** Buffer pointer. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param  inputStream The input stream to use for reading.
     */
    public UnicodeInputStream(final InputStream inputStream)
            throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * @param  inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            IllegalStateException ex = new IllegalStateException("Could not read BOM from Stream");
            ExceptionUtils.setCause(ex, ioe);
            throw ex;
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return  True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Read encoding based on BOM.
     *
     * @return  The encoding based on the BOM.
     *
     * @throws  IllegalStateException  When a problem reading the BOM occured.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM encoding = null;

        // read first byte.
        if (readByte())
        {
            // Build a list of matches
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                encoding = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                encoding = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                encoding = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                encoding = match(UTF16LE_BOM, null);

                if (encoding != null)
                {
                    encoding = match(UTF32LE_BOM, encoding);
                }
                break;

            default:
                encoding = null;
                break;
            }
        }

        pushback(encoding);

        return (encoding != null) ? encoding.getEncoding() : null;
    }

    private final UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        byte [] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++)
        {
            if (pos <= i) // Byte has not yet been read
            {
                if (!readByte())
                {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    private final boolean readByte()
            throws IOException
    {
        int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        buf[pos++] = (byte) res;
        return true;
    }

    private final void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * @see java.io.InputStream#mark(int)
     */
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * @see java.io.InputStream#markSupported()
     */
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * @see java.io.InputStream#reset()
     */
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM
    {
        private final String encoding;

        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        String getEncoding()
        {
            return encoding;
        }

        byte [] getBytes()
        {
            return bytes;
        }
    }
}