CharsetDecoder.java example

Explorer
property-db-master
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package java.nio.charset;

import java.nio.BufferOverflowException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;

/**
 * A converter that can convert a byte sequence from a charset into a 16-bit
 * Unicode character sequence.
 * <p>
 * The input byte sequence is wrapped by a
 * {@link java.nio.ByteBuffer ByteBuffer} and the output character sequence is a
 * {@link java.nio.CharBuffer CharBuffer}. A decoder instance should be used in
 * the following sequence, which is referred to as a decoding operation:
 * <ol>
 * <li>invoking the {@link #reset() reset} method to reset the decoder if the
 * decoder has been used;</li>
 * <li>invoking the {@link #decode(ByteBuffer, CharBuffer, boolean) decode}
 * method until the additional input is not needed, the <code>endOfInput</code>
 * parameter must be set to false, the input buffer must be filled and the
 * output buffer must be flushed between invocations;</li>
 * <li>invoking the {@link #decode(ByteBuffer, CharBuffer, boolean) decode}
 * method for the last time, and then the <code>endOfInput</code> parameter
 * must be set to true;</li>
 * <li>invoking the {@link #flush(CharBuffer) flush} method to flush the
 * output.</li>
 * </ol>
 * <p>
 * The {@link #decode(ByteBuffer, CharBuffer, boolean) decode} method will
 * convert as many bytes as possible, and the process won't stop until the input
 * bytes have run out, the output buffer has been filled or some error has
 * happened. A {@link CoderResult CoderResult} instance will be returned to
 * indicate the stop reason, and the invoker can identify the result and choose
 * further action, which includes filling the input buffer, flushing the output
 * buffer or recovering from an error and trying again.
 * <p>
 * There are two common decoding errors. One is named malformed and it is
 * returned when the input byte sequence is illegal for the current specific
 * charset, the other is named unmappable character and it is returned when a
 * problem occurs mapping a legal input byte sequence to its Unicode character
 * equivalent.
 * <p>
 * Both errors can be handled in three ways, the default one is to report the
 * error to the invoker by a {@link CoderResult CoderResult} instance, and the
 * alternatives are to ignore it or to replace the erroneous input with the
 * replacement string. The replacement string is "\uFFFD" by default and can be
 * changed by invoking {@link #replaceWith(String) replaceWith} method. The
 * invoker of this decoder can choose one way by specifying a
 * {@link CodingErrorAction CodingErrorAction} instance for each error type via
 * {@link #onMalformedInput(CodingErrorAction) onMalformedInput} method and
 * {@link #onUnmappableCharacter(CodingErrorAction) onUnmappableCharacter}
 * method.
 * <p>
 * This is an abstract class and encapsulates many common operations of the
 * decoding process for all charsets. Decoders for a specific charset should
 * extend this class and need only to implement the
 * {@link #decodeLoop(ByteBuffer, CharBuffer) decodeLoop} method for the basic
 * decoding. If a subclass maintains an internal state, it should override the
 * {@link #implFlush(CharBuffer) implFlush} method and the
 * {@link #implReset() implReset} method in addition.
 * <p>
 * This class is not thread-safe.
 *
 * @see java.nio.charset.Charset
 * @see java.nio.charset.CharsetEncoder
 */
public abstract class CharsetDecoder {
    private static final int INIT = 0;
    private static final int ONGOING = 1;
    private static final int END = 2;
    private static final int FLUSH = 3;

    private final float averageCharsPerByte;
    private final float maxCharsPerByte;

    private final Charset cs;

    private CodingErrorAction malformedInputAction;
    private CodingErrorAction unmappableCharacterAction;

    private String replacementChars;

    private int status;

    /**
     * Constructs a new <code>CharsetDecoder</code> using the given
     * <code>Charset</code>, average number and maximum number of characters
     * created by this decoder for one input byte, and the default replacement
     * string "\uFFFD".
     *
     * @param charset
     *            the <code>Charset</code> to be used by this decoder.
     * @param averageCharsPerByte
     *            the average number of characters created by this decoder for
     *            one input byte, must be positive.
     * @param maxCharsPerByte
     *            the maximum number of characters created by this decoder for
     *            one input byte, must be positive.
     * @throws IllegalArgumentException
     *             if <code>averageCharsPerByte</code> or
     *             <code>maxCharsPerByte</code> is negative.
     */
    protected CharsetDecoder(Charset charset, float averageCharsPerByte, float maxCharsPerByte) {
        if (averageCharsPerByte <= 0 || maxCharsPerByte <= 0) {
            throw new IllegalArgumentException("averageCharsPerByte and maxCharsPerByte must be positive");
        }
        if (averageCharsPerByte > maxCharsPerByte) {
            throw new IllegalArgumentException("averageCharsPerByte is greater than maxCharsPerByte");
        }
        this.averageCharsPerByte = averageCharsPerByte;
        this.maxCharsPerByte = maxCharsPerByte;
        cs = charset;
        status = INIT;
        malformedInputAction = CodingErrorAction.REPORT;
        unmappableCharacterAction = CodingErrorAction.REPORT;
        replacementChars = "\ufffd";
    }

    /**
     * Returns the average number of characters created by this decoder for a
     * single input byte.
     */
    public final float averageCharsPerByte() {
        return averageCharsPerByte;
    }

    /**
     * Returns the {@link Charset} which this decoder uses.
     */
    public final Charset charset() {
        return cs;
    }

    /**
     * This is a facade method for the decoding operation.
     * <p>
     * This method decodes the remaining byte sequence of the given byte buffer
     * into a new character buffer. This method performs a complete decoding
     * operation, resets at first, then decodes, and flushes at last.
     * <p>
     * This method should not be invoked while another {@code decode} operation
     * is ongoing.
     *
     * @param in
     *            the input buffer.
     * @return a new <code>CharBuffer</code> containing the the characters
     *         produced by this decoding operation. The buffer's limit will be
     *         the position of the last character in the buffer, and the
     *         position will be zero.
     * @throws IllegalStateException
     *             if another decoding operation is ongoing.
     * @throws MalformedInputException
     *             if an illegal input byte sequence for this charset was
     *             encountered, and the action for malformed error is
     *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}
     * @throws UnmappableCharacterException
     *             if a legal but unmappable input byte sequence for this
     *             charset was encountered, and the action for unmappable
     *             character error is
     *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.
     *             Unmappable means the byte sequence at the input buffer's
     *             current position cannot be mapped to a Unicode character
     *             sequence.
     * @throws CharacterCodingException
     *             if another exception happened during the decode operation.
     */
    public final CharBuffer decode(ByteBuffer in) throws CharacterCodingException {
        reset();
        int length = (int) (in.remaining() * averageCharsPerByte);
        CharBuffer output = CharBuffer.allocate(length);
        CoderResult result = null;
        while (true) {
            result = decode(in, output, false);
            checkCoderResult(result);
            if (result.isUnderflow()) {
                break;
            } else if (result.isOverflow()) {
                output = allocateMore(output);
            }
        }
        result = decode(in, output, true);
        checkCoderResult(result);

        while (true) {
            result = flush(output);
            checkCoderResult(result);
            if (result.isOverflow()) {
                output = allocateMore(output);
            } else {
                break;
            }
        }

        output.flip();
        status = FLUSH;
        return output;
    }

    /*
     * checks the result whether it needs to throw CharacterCodingException.
     */
    private void checkCoderResult(CoderResult result) throws CharacterCodingException {
        if (result.isMalformed() && malformedInputAction == CodingErrorAction.REPORT) {
            throw new MalformedInputException(result.length());
        } else if (result.isUnmappable() && unmappableCharacterAction == CodingErrorAction.REPORT) {
            throw new UnmappableCharacterException(result.length());
        }
    }

    /*
     * original output is full and doesn't have remaining. allocate more space
     * to new CharBuffer and return it, the contents in the given buffer will be
     * copied into the new buffer.
     */
    private CharBuffer allocateMore(CharBuffer output) {
        if (output.capacity() == 0) {
            return CharBuffer.allocate(1);
        }
        CharBuffer result = CharBuffer.allocate(output.capacity() * 2);
        output.flip();
        result.put(output);
        return result;
    }

    /**
     * Decodes bytes starting at the current position of the given input buffer,
     * and writes the equivalent character sequence into the given output buffer
     * from its current position.
     * <p>
     * The buffers' position will be changed with the reading and writing
     * operation, but their limits and marks will be kept intact.
     * <p>
     * A <code>CoderResult</code> instance will be returned according to
     * following rules:
     * <ul>
     * <li>{@link CoderResult#OVERFLOW CoderResult.OVERFLOW} indicates that
     * even though not all of the input has been processed, the buffer the
     * output is being written to has reached its capacity. In the event of this
     * code being returned this method should be called once more with an
     * <code>out</code> argument that has not already been filled.</li>
     * <li>{@link CoderResult#UNDERFLOW CoderResult.UNDERFLOW} indicates that
     * as many bytes as possible in the input buffer have been decoded. If there
     * is no further input and no remaining bytes in the input buffer then this
     * operation may be regarded as complete. Otherwise, this method should be
     * called once more with additional input.</li>
     * <li>A {@link CoderResult#malformedForLength(int) malformed input} result
     * indicates that some malformed input error has been encountered, and the
     * erroneous bytes start at the input buffer's position and their number can
     * be got by result's {@link CoderResult#length() length}. This kind of
     * result can be returned only if the malformed action is
     * {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}. </li>
     * <li>A {@link CoderResult#unmappableForLength(int) unmappable character}
     * result indicates that some unmappable character error has been
     * encountered, and the erroneous bytes start at the input buffer's position
     * and their number can be got by result's
     * {@link CoderResult#length() length}. This kind of result can be returned
     * only if the unmappable character action is
     * {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}. </li>
     * </ul>
     * <p>
     * The <code>endOfInput</code> parameter indicates that the invoker cannot
     * provide further input. This parameter is true if and only if the bytes in
     * current input buffer are all inputs for this decoding operation. Note
     * that it is common and won't cause an error if the invoker sets false and
     * then can't provide more input, while it may cause an error if the invoker
     * always sets true in several consecutive invocations. This would make the
     * remaining input to be treated as malformed input.
     * <p>
     * This method invokes the
     * {@link #decodeLoop(ByteBuffer, CharBuffer) decodeLoop} method to
     * implement the basic decode logic for a specific charset.
     *
     * @param in
     *            the input buffer.
     * @param out
     *            the output buffer.
     * @param endOfInput
     *            true if all the input characters have been provided.
     * @return a <code>CoderResult</code> instance which indicates the reason
     *         of termination.
     * @throws IllegalStateException
     *             if decoding has started or no more input is needed in this
     *             decoding progress.
     * @throws CoderMalfunctionError
     *             if the {@link #decodeLoop(ByteBuffer, CharBuffer) decodeLoop}
     *             method threw an <code>BufferUnderflowException</code> or
     *             <code>BufferOverflowException</code>.
     */
    public final CoderResult decode(ByteBuffer in, CharBuffer out,
            boolean endOfInput) {
        /*
         * status check
         */
        if ((status == FLUSH) || (!endOfInput && status == END)) {
            throw new IllegalStateException();
        }

        CoderResult result = null;

        // begin to decode
        while (true) {
            CodingErrorAction action = null;
            try {
                result = decodeLoop(in, out);
            } catch (BufferOverflowException ex) {
                // unexpected exception
                throw new CoderMalfunctionError(ex);
            } catch (BufferUnderflowException ex) {
                // unexpected exception
                throw new CoderMalfunctionError(ex);
            }

            /*
             * result handling
             */
            if (result.isUnderflow()) {
                int remaining = in.remaining();
                status = endOfInput ? END : ONGOING;
                if (endOfInput && remaining > 0) {
                    result = CoderResult.malformedForLength(remaining);
                } else {
                    return result;
                }
            }
            if (result.isOverflow()) {
                return result;
            }
            // set coding error handle action
            action = malformedInputAction;
            if (result.isUnmappable()) {
                action = unmappableCharacterAction;
            }
            // If the action is IGNORE or REPLACE, we should continue decoding.
            if (action == CodingErrorAction.REPLACE) {
                if (out.remaining() < replacementChars.length()) {
                    return CoderResult.OVERFLOW;
                }
                out.put(replacementChars);
            } else {
                if (action != CodingErrorAction.IGNORE)
                    return result;
            }
            in.position(in.position() + result.length());
        }
    }

    /**
     * Decodes bytes into characters. This method is called by the
     * {@link #decode(ByteBuffer, CharBuffer, boolean) decode} method.
     * <p>
     * This method will implement the essential decoding operation, and it won't
     * stop decoding until either all the input bytes are read, the output
     * buffer is filled, or some exception is encountered. Then it will return a
     * <code>CoderResult</code> object indicating the result of current
     * decoding operation. The rules to construct the <code>CoderResult</code>
     * are the same as for
     * {@link #decode(ByteBuffer, CharBuffer, boolean) decode}. When an
     * exception is encountered in the decoding operation, most implementations
     * of this method will return a relevant result object to the
     * {@link #decode(ByteBuffer, CharBuffer, boolean) decode} method, and some
     * performance optimized implementation may handle the exception and
     * implement the error action itself.
     * <p>
     * The buffers are scanned from their current positions, and their positions
     * will be modified accordingly, while their marks and limits will be
     * intact. At most {@link ByteBuffer#remaining() in.remaining()} characters
     * will be read, and {@link CharBuffer#remaining() out.remaining()} bytes
     * will be written.
     * <p>
     * Note that some implementations may pre-scan the input buffer and return a
     * <code>CoderResult.UNDERFLOW</code> until it receives sufficient input.
     *
     * @param in
     *            the input buffer.
     * @param out
     *            the output buffer.
     * @return a <code>CoderResult</code> instance indicating the result.
     */
    protected abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out);

    /**
     * Gets the charset detected by this decoder; this method is optional.
     * <p>
     * If implementing an auto-detecting charset, then this decoder returns the
     * detected charset from this method when it is available. The returned
     * charset will be the same for the rest of the decode operation.
     * <p>
     * If insufficient bytes have been read to determine the charset, an
     * <code>IllegalStateException</code> will be thrown.
     * <p>
     * The default implementation always throws
     * <code>UnsupportedOperationException</code>, so it should be overridden
     * by a subclass if needed.
     *
     * @return the charset detected by this decoder, or null if it is not yet
     *         determined.
     * @throws UnsupportedOperationException
     *             if this decoder does not implement an auto-detecting charset.
     * @throws IllegalStateException
     *             if insufficient bytes have been read to determine the
     *             charset.
     */
    public Charset detectedCharset() {
        throw new UnsupportedOperationException();
    }

    /**
     * Flushes this decoder.
     *
     * This method will call {@link #implFlush(CharBuffer) implFlush}. Some
     * decoders may need to write some characters to the output buffer when they
     * have read all input bytes; subclasses can override
     * {@link #implFlush(CharBuffer) implFlush} to perform the writing operation.
     * <p>
     * The maximum number of written bytes won't be larger than
     * {@link CharBuffer#remaining() out.remaining()}. If some decoder wants to
     * write more bytes than an output buffer's remaining space allows, then a
     * <code>CoderResult.OVERFLOW</code> will be returned, and this method
     * must be called again with a character buffer that has more remaining
     * space. Otherwise this method will return
     * <code>CoderResult.UNDERFLOW</code>, which means one decoding process
     * has been completed successfully.
     * <p>
     * During the flush, the output buffer's position will be changed
     * accordingly, while its mark and limit will be intact.
     *
     * @param out
     *            the given output buffer.
     * @return <code>CoderResult.UNDERFLOW</code> or
     *         <code>CoderResult.OVERFLOW</code>.
     * @throws IllegalStateException
     *             if this decoder hasn't read all input bytes during one
     *             decoding process, which means neither after calling
     *             {@link #decode(ByteBuffer) decode(ByteBuffer)} nor after
     *             calling {@link #decode(ByteBuffer, CharBuffer, boolean)
     *             decode(ByteBuffer, CharBuffer, boolean)} with true as value
     *             for the last boolean parameter.
     */
    public final CoderResult flush(CharBuffer out) {
        if (status != END && status != INIT) {
            throw new IllegalStateException();
        }
        CoderResult result = implFlush(out);
        if (result == CoderResult.UNDERFLOW) {
            status = FLUSH;
        }
        return result;
    }

    /**
     * Flushes this decoder. The default implementation does nothing and always
     * returns <code>CoderResult.UNDERFLOW</code>; this method can be
     * overridden if needed.
     *
     * @param out
     *            the output buffer.
     * @return <code>CoderResult.UNDERFLOW</code> or
     *         <code>CoderResult.OVERFLOW</code>.
     */
    protected CoderResult implFlush(CharBuffer out) {
        return CoderResult.UNDERFLOW;
    }

    /**
     * Notifies that this decoder's <code>CodingErrorAction</code> specified
     * for malformed input error has been changed. The default implementation
     * does nothing; this method can be overridden if needed.
     *
     * @param newAction
     *            the new action.
     */
    protected void implOnMalformedInput(CodingErrorAction newAction) {
        // default implementation is empty
    }

    /**
     * Notifies that this decoder's <code>CodingErrorAction</code> specified
     * for unmappable character error has been changed. The default
     * implementation does nothing; this method can be overridden if needed.
     *
     * @param newAction
     *            the new action.
     */
    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
        // default implementation is empty
    }

    /**
     * Notifies that this decoder's replacement has been changed. The default
     * implementation does nothing; this method can be overridden if needed.
     *
     * @param newReplacement
     *            the new replacement string.
     */
    protected void implReplaceWith(String newReplacement) {
        // default implementation is empty
    }

    /**
     * Reset this decoder's charset related state. The default implementation
     * does nothing; this method can be overridden if needed.
     */
    protected void implReset() {
        // default implementation is empty
    }

    /**
     * Indicates whether this decoder implements an auto-detecting charset.
     *
     * @return <code>true</code> if this decoder implements an auto-detecting
     *         charset.
     */
    public boolean isAutoDetecting() {
        return false;
    }

    /**
     * Indicates whether this decoder has detected a charset; this method is
     * optional.
     * <p>
     * If this decoder implements an auto-detecting charset, then this method
     * may start to return true during decoding operation to indicate that a
     * charset has been detected in the input bytes and that the charset can be
     * retrieved by invoking the {@link #detectedCharset() detectedCharset}
     * method.
     * <p>
     * Note that a decoder that implements an auto-detecting charset may still
     * succeed in decoding a portion of the given input even when it is unable
     * to detect the charset. For this reason users should be aware that a
     * <code>false</code> return value does not indicate that no decoding took
     * place.
     * <p>
     * The default implementation always throws an
     * <code>UnsupportedOperationException</code>; it should be overridden by
     * a subclass if needed.
     *
     * @return <code>true</code> if this decoder has detected a charset.
     * @throws UnsupportedOperationException
     *             if this decoder doesn't implement an auto-detecting charset.
     */
    public boolean isCharsetDetected() {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns this decoder's <code>CodingErrorAction</code> when malformed input
     * occurred during the decoding process.
     */
    public CodingErrorAction malformedInputAction() {
        return malformedInputAction;
    }

    /**
     * Returns the maximum number of characters which can be created by this
     * decoder for one input byte, must be positive.
     */
    public final float maxCharsPerByte() {
        return maxCharsPerByte;
    }

    /**
     * Sets this decoder's action on malformed input errors.
     *
     * This method will call the
     * {@link #implOnMalformedInput(CodingErrorAction) implOnMalformedInput}
     * method with the given new action as argument.
     *
     * @param newAction
     *            the new action on malformed input error.
     * @return this decoder.
     * @throws IllegalArgumentException
     *             if {@code newAction} is {@code null}.
     */
    public final CharsetDecoder onMalformedInput(CodingErrorAction newAction) {
        if (newAction == null) {
            throw new IllegalArgumentException("newAction == null");
        }
        malformedInputAction = newAction;
        implOnMalformedInput(newAction);
        return this;
    }

    /**
     * Sets this decoder's action on unmappable character errors.
     *
     * This method will call the
     * {@link #implOnUnmappableCharacter(CodingErrorAction) implOnUnmappableCharacter}
     * method with the given new action as argument.
     *
     * @param newAction
     *            the new action on unmappable character error.
     * @return this decoder.
     * @throws IllegalArgumentException
     *             if {@code newAction} is {@code null}.
     */
    public final CharsetDecoder onUnmappableCharacter(CodingErrorAction newAction) {
        if (newAction == null) {
            throw new IllegalArgumentException("newAction == null");
        }
        unmappableCharacterAction = newAction;
        implOnUnmappableCharacter(newAction);
        return this;
    }

    /**
     * Returns the replacement string, which is never null or empty.
     */
    public final String replacement() {
        return replacementChars;
    }

    /**
     * Sets the new replacement string.
     *
     * This method first checks the given replacement's validity, then changes
     * the replacement value, and at last calls the
     * {@link #implReplaceWith(String) implReplaceWith} method with the given
     * new replacement as argument.
     *
     * @param replacement
     *            the replacement string, cannot be null or empty. Its length
     *            cannot be larger than {@link #maxCharsPerByte()}.
     * @return this decoder.
     * @throws IllegalArgumentException
     *             if the given replacement cannot satisfy the requirement
     *             mentioned above.
     */
    public final CharsetDecoder replaceWith(String replacement) {
        if (replacement == null) {
            throw new IllegalArgumentException("replacement == null");
        }
        if (replacement.isEmpty()) {
            throw new IllegalArgumentException("replacement.isEmpty()");
        }
        if (replacement.length() > maxCharsPerByte()) {
            throw new IllegalArgumentException("replacement length > maxCharsPerByte: " +
                    replacement.length() + " > " + maxCharsPerByte());
        }
        replacementChars = replacement;
        implReplaceWith(replacement);
        return this;
    }

    /**
     * Resets this decoder. This method will reset the internal status, and then
     * calls <code>implReset()</code> to reset any status related to the
     * specific charset.
     *
     * @return this decoder.
     */
    public final CharsetDecoder reset() {
        status = INIT;
        implReset();
        return this;
    }

    /**
     * Returns this decoder's <code>CodingErrorAction</code> when an unmappable
     * character error occurred during the decoding process.
     */
    public CodingErrorAction unmappableCharacterAction() {
        return unmappableCharacterAction;
    }
}