// COMPOUND_TEXT_Decoder.java (source tree: jdk7u-jdk-master)
/*
 * Copyright (c) 2001, 2005, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs.ext;
import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;

/**
 * An algorithmic conversion from COMPOUND_TEXT to Unicode.
 *
 * <p>Input is processed one byte at a time by a state machine (see
 * {@link #handleByte}). Ordinary text bytes are forwarded to one of two
 * delegate decoders ({@code glDecoder} for 0x20-0x7F, {@code grDecoder} for
 * 0xA0-0xFF) or, inside a non-standard segment, to
 * {@code nonStandardDecoder}. Escape (0x1B) and control (0x9B) sequences are
 * accumulated in {@code queue} and either switch the delegate decoders or
 * are skipped.
 */
public class COMPOUND_TEXT_Decoder extends CharsetDecoder {

    // ---- Parser states -------------------------------------------------
    // Text states: bytes are decoded to characters.
    private static final int NORMAL_BYTES             =  0;
    private static final int NONSTANDARD_BYTES        =  1;
    // Entered after 0x1B 0x23 (V), then after one 0x20-0x2F byte (TERM).
    private static final int VERSION_SEQUENCE_V       =  2;
    private static final int VERSION_SEQUENCE_TERM    =  3;
    // Entered after 0x1B.
    private static final int ESCAPE_SEQUENCE          =  4;
    // Entered after 0x1B 0x24, then 0x28 (NL*) or 0x29 (NR*).
    private static final int CHARSET_NGIIF            =  5;
    private static final int CHARSET_NLIIF            =  6;
    private static final int CHARSET_NLIF             =  7;
    private static final int CHARSET_NRIIF            =  8;
    private static final int CHARSET_NRIF             =  9;
    // Entered after 0x1B 0x25; the following states read 0x2F, a final
    // byte, two length bytes and then the encoding name.
    private static final int CHARSET_NONSTANDARD_FOML = 10;
    private static final int CHARSET_NONSTANDARD_OML  = 11;
    private static final int CHARSET_NONSTANDARD_ML   = 12;
    private static final int CHARSET_NONSTANDARD_L    = 13;
    private static final int CHARSET_NONSTANDARD      = 14;
    // Entered after 0x1B 0x28 (L*) or 0x1B 0x29 / 0x1B 0x2D (R*).
    private static final int CHARSET_LIIF             = 15;
    private static final int CHARSET_LIF              = 16;
    private static final int CHARSET_RIIF             = 17;
    private static final int CHARSET_RIF              = 18;
    // Entered after 0x9B (PIF), then after the first 0x20-0x2F byte (IF).
    private static final int CONTROL_SEQUENCE_PIF     = 19;
    private static final int CONTROL_SEQUENCE_IF      = 20;
    // Extension segment: two length bytes, then 'ext_count' skipped bytes.
    private static final int EXTENSION_ML             = 21;
    private static final int EXTENSION_L              = 22;
    private static final int EXTENSION                = 23;
    // Inside an escape sequence that is skipped up to its final byte.
    private static final int ESCAPE_SEQUENCE_OTHER    = 24;

    private static final String ERR_LATIN1 = "ISO8859_1 unsupported";
    private static final String ERR_ILLSTATE = "Illegal state";
    private static final String ERR_ESCBYTE =
        "Illegal byte in 0x1B escape sequence";
    private static final String ERR_ENCODINGBYTE =
        "Illegal byte in non-standard character set name";
    private static final String ERR_CTRLBYTE =
        "Illegal byte in 0x9B control sequence";
    private static final String ERR_CTRLPI =
        "P following I in 0x9B control sequence";
    private static final String ERR_VERSTART =
        "Versioning escape sequence can only appear at start of byte stream";
    private static final String ERR_VERMANDATORY =
        "Cannot parse mandatory extensions";
    private static final String ERR_ENCODING = "Unknown encoding: ";
    private static final String ERR_FLUSH =
        "Escape sequence, control sequence, or ML extension not terminated";

    /** Current parser state; one of the int constants above. */
    private int state = NORMAL_BYTES;
    /** Declared length of the current segment and bytes of it seen so far. */
    private int ext_count, ext_offset;
    /** True until any byte other than a leading version sequence is seen. */
    private boolean versionSequenceAllowed = true;
    /**
     * Set to false by a handler that returns OVERFLOW without having stored
     * or acted on the current byte, so that decodeLoop leaves the byte in
     * the source buffer to be retried.
     */
    private boolean currentByteConsumed = true;
    /** Bytes handed to the active delegate decoder but not yet decoded. */
    private final ByteBuffer inBB = ByteBuffer.allocate(16);
    /** Bytes of the escape/control sequence currently being parsed. */
    private final ByteArrayOutputStream queue = new ByteArrayOutputStream();
    /** Bytes of the non-standard encoding name currently being parsed. */
    private final ByteArrayOutputStream encodingQueue =
        new ByteArrayOutputStream();

    private CharsetDecoder glDecoder, grDecoder, nonStandardDecoder,
        lastDecoder;
    /** Whether bit 7 is forced on (true) or off (false) before decoding. */
    private boolean glHigh = false, grHigh = true;


    /**
     * Creates a decoder whose initial delegates are ASCII (low half) and
     * ISO8859_1 (high half).
     *
     * @param cs the charset that created this decoder
     */
    public COMPOUND_TEXT_Decoder(Charset cs) {
        super(cs, 1.0f, 1.0f);
        designateInitialDecoders();
    }

    /**
     * Installs fresh ASCII / ISO8859_1 delegate decoders. Shared by the
     * constructor and {@link #implReset()}.
     */
    private void designateInitialDecoders() {
        try {
            // Initial state in ISO 2022 designates Latin-1 charset.
            glDecoder = Charset.forName("ASCII").newDecoder();
            grDecoder = Charset.forName("ISO8859_1").newDecoder();
        } catch (IllegalArgumentException e) {
            error(ERR_LATIN1, e);
        }
        initDecoder(glDecoder);
        initDecoder(grDecoder);
    }

    /**
     * Feeds the remaining bytes of {@code src} through the state machine
     * until the input is exhausted or a handler reports a non-underflow
     * result.
     *
     * <p>The buffer is read with absolute {@code get}, so direct and
     * read-only buffers are accepted. On an error result the position is
     * moved back to the start of the offending bytes (as far as they lie in
     * this buffer) and the reported length is clamped accordingly, because
     * {@code CharsetDecoder.decode} skips {@code length()} bytes from the
     * position when the action is IGNORE or REPLACE.
     *
     * @param src source bytes; its position is updated on return
     * @param des destination for decoded characters
     * @return UNDERFLOW, OVERFLOW, or a malformed/unmappable result
     */
    @Override
    protected CoderResult decodeLoop(ByteBuffer src, CharBuffer des) {
        final int start = src.position();
        final int limit = src.limit();
        int pos = start;
        CoderResult cr = CoderResult.UNDERFLOW;

        try {
            while (pos < limit && cr.isUnderflow()) {
                // Byte parsing is done with shorts instead of bytes because
                // Java bytes are signed, while COMPOUND_TEXT bytes are not. If
                // we used the Java byte type, the > and < tests during parsing
                // would not work correctly.
                currentByteConsumed = true;
                cr = handleByte((short)(src.get(pos) & 0xFF), des);
                if (currentByteConsumed) {
                    pos++;
                }
            }
            if (cr.isError()) {
                // An error is only produced for a consumed byte, so at least
                // one byte of this buffer belongs to the bad input.
                int len = Math.min(cr.length(), pos - start);
                pos -= len;
                cr = cr.isMalformed()
                    ? CoderResult.malformedForLength(len)
                    : CoderResult.unmappableForLength(len);
            }
            return cr;
        } finally {
            src.position(pos);
        }
    }

    /**
     * Dispatches one input byte to the handler for the current state.
     *
     * @param newByte the byte as an unsigned value (0x00-0xFF)
     * @param cb destination for decoded characters
     * @return the handler's result
     */
    private CoderResult handleByte(short newByte, CharBuffer cb) {
        CoderResult cr = CoderResult.UNDERFLOW;
        switch (state) {
        case NORMAL_BYTES:
            cr = normalBytes(newByte, cb);
            break;
        case NONSTANDARD_BYTES:
            cr = nonStandardBytes(newByte, cb);
            break;
        case VERSION_SEQUENCE_V:
        case VERSION_SEQUENCE_TERM:
            cr = versionSequence(newByte);
            break;
        case ESCAPE_SEQUENCE:
            cr = escapeSequence(newByte);
            break;
        case CHARSET_NGIIF:
            cr = charset94N(newByte);
            break;
        case CHARSET_NLIIF:
        case CHARSET_NLIF:
            cr = charsetDesignation(newByte, cb, CHARSET_NLIIF, CHARSET_NLIF);
            break;
        case CHARSET_NRIIF:
        case CHARSET_NRIF:
            cr = charsetDesignation(newByte, cb, CHARSET_NRIIF, CHARSET_NRIF);
            break;
        case CHARSET_NONSTANDARD_FOML:
        case CHARSET_NONSTANDARD_OML:
        case CHARSET_NONSTANDARD_ML:
        case CHARSET_NONSTANDARD_L:
        case CHARSET_NONSTANDARD:
            cr = charsetNonStandard(newByte, cb);
            break;
        case CHARSET_LIIF:
        case CHARSET_LIF:
            cr = charsetDesignation(newByte, cb, CHARSET_LIIF, CHARSET_LIF);
            break;
        case CHARSET_RIIF:
        case CHARSET_RIF:
            cr = charsetDesignation(newByte, cb, CHARSET_RIIF, CHARSET_RIF);
            break;
        case CONTROL_SEQUENCE_PIF:
        case CONTROL_SEQUENCE_IF:
            cr = controlSequence(newByte);
            break;
        case EXTENSION_ML:
        case EXTENSION_L:
        case EXTENSION:
            cr = extension(newByte);
            break;
        case ESCAPE_SEQUENCE_OTHER:
            cr = escapeSequenceOther(newByte);
            break;
        default:
            error(ERR_ILLSTATE);
        }
        return cr;
    }

    /**
     * Handles a byte in the NORMAL_BYTES state: starts an escape or control
     * sequence, emits tab/newline, silently drops other C0/C1 bytes, or
     * decodes a text byte with the GL or GR delegate.
     */
    private CoderResult normalBytes(short newByte, CharBuffer cb) {
        CoderResult cr = CoderResult.UNDERFLOW;
        if ((newByte >= 0x00 && newByte <= 0x1F) || // C0
            (newByte >= 0x80 && newByte <= 0x9F)) { // C1
            char newChar;

            switch (newByte) {
            case 0x1B:
                // Version sequences start with 0x1B, so the flag is left
                // untouched here and decided by the following bytes.
                state = ESCAPE_SEQUENCE;
                queue.write(newByte);
                return cr;
            case 0x9B:
                state = CONTROL_SEQUENCE_PIF;
                versionSequenceAllowed = false;
                queue.write(newByte);
                return cr;
            case 0x09:
                versionSequenceAllowed = false;
                newChar = '\t';
                break;
            case 0x0A:
                versionSequenceAllowed = false;
                newChar = '\n';
                break;
            default:
                // Any other control byte produces no output.
                versionSequenceAllowed = false;
                return cr;
            }
            if (!cb.hasRemaining()) {
                return overflowWithoutConsuming();
            }
            cb.put(newChar);
        } else {
            CharsetDecoder decoder;
            boolean high;
            versionSequenceAllowed = false;

            if (newByte >= 0x20 && newByte <= 0x7F) {
                decoder = glDecoder;
                high = glHigh;
            } else /* if (newByte >= 0xA0 && newByte <= 0xFF) */ {
                decoder = grDecoder;
                high = grHigh;
            }
            if (lastDecoder != null && decoder != lastDecoder) {
                cr = flushDecoder(lastDecoder, cb);
            }
            lastDecoder = decoder;

            if (decoder != null) {
                // Force bit 7 to the value the designated charset expects.
                byte b = (byte)newByte;
                if (high) {
                    b |= 0x80;
                } else {
                    b &= 0x7F;
                }
                cr = feedDecoder(decoder, b, cb);
            } else if (cb.remaining() >= replacement().length()) {
                // No decoder is designated for this half: substitute.
                cb.put(replacement());
            } else {
                return overflowWithoutConsuming();
            }
        }
        return cr;
    }

    /**
     * Handles a data byte of a non-standard segment and leaves the segment
     * once {@code ext_count} bytes of it have been seen.
     */
    private CoderResult nonStandardBytes(short newByte, CharBuffer cb)
    {
        CoderResult cr = CoderResult.UNDERFLOW;
        if (nonStandardDecoder != null) {
            cr = feedDecoder(nonStandardDecoder, (byte)newByte, cb);
        } else if (cb.remaining() >= replacement().length()) {
            cb.put(replacement());
        } else {
            // Not counted in ext_offset: the byte will be presented again.
            return overflowWithoutConsuming();
        }

        ext_offset++;
        if (ext_offset >= ext_count) {
            ext_offset = ext_count = 0;
            state = NORMAL_BYTES;
            // nonStandardDecoder is null when the encoding was unknown.
            if (nonStandardDecoder != null) {
                cr = flushDecoder(nonStandardDecoder, cb);
                nonStandardDecoder = null;
            }
        }
        return cr;
    }

    /**
     * Appends one byte to the pending input of {@code dec} and decodes as
     * much as possible. Undecoded bytes are kept for the next call unless
     * the decoder reported an error, in which case they are discarded.
     */
    private CoderResult feedDecoder(CharsetDecoder dec, byte b,
                                    CharBuffer cb) {
        inBB.put(b);
        inBB.flip();
        CoderResult cr = dec.decode(inBB, cb, false);
        if (cr.isError()) {
            inBB.clear();
        } else {
            // Keeps only the bytes the decoder did not consume.
            inBB.compact();
        }
        return cr;
    }

    /**
     * Reports OVERFLOW for a byte that has not been stored or acted upon,
     * telling decodeLoop to leave it in the source buffer.
     */
    private CoderResult overflowWithoutConsuming() {
        currentByteConsumed = false;
        return CoderResult.OVERFLOW;
    }

    /**
     * Handles the byte that follows 0x1B and selects the kind of escape
     * sequence being parsed.
     */
    private CoderResult escapeSequence(short newByte) {
        switch (newByte) {
        case 0x23:
            state = VERSION_SEQUENCE_V;
            break;
        case 0x24:
            state = CHARSET_NGIIF;
            versionSequenceAllowed = false;
            break;
        case 0x25:
            state = CHARSET_NONSTANDARD_FOML;
            versionSequenceAllowed = false;
            break;
        case 0x28:
            state = CHARSET_LIIF;
            versionSequenceAllowed = false;
            break;
        case 0x29:
        case 0x2D:
            state = CHARSET_RIIF;
            versionSequenceAllowed = false;
            break;
        default:
            // escapeSequenceOther will write to queue if appropriate
            return escapeSequenceOther(newByte);
        }

        queue.write(newByte);
        return CoderResult.UNDERFLOW;
    }

    /**
     * Test for unknown, but valid, escape sequences.
     */
    private CoderResult escapeSequenceOther(short newByte) {
        if (newByte >= 0x20 && newByte <= 0x2F) {
            // {I}
            state = ESCAPE_SEQUENCE_OTHER;
            versionSequenceAllowed = false;
            queue.write(newByte);
        } else if (newByte >= 0x30 && newByte <= 0x7E) {
            // F -- end of sequence
            state = NORMAL_BYTES;
            versionSequenceAllowed = false;
            queue.reset();
        } else {
            return malformedInput(ERR_ESCBYTE);
        }
        return CoderResult.UNDERFLOW;
    }

    /**
     * Parses directionality, as well as unknown, but valid, control sequences.
     */
    private CoderResult controlSequence(short newByte) {
        if (newByte >= 0x30 && newByte <= 0x3F) {
            // {P}
            if (state == CONTROL_SEQUENCE_IF) {
                // P no longer allowed
                return malformedInput(ERR_CTRLPI);
            }
            queue.write(newByte);
        } else if (newByte >= 0x20 && newByte <= 0x2F) {
            // {I}
            state = CONTROL_SEQUENCE_IF;
            queue.write(newByte);
        } else if (newByte >= 0x40 && newByte <= 0x7E) {
            // F -- end of sequence
            state = NORMAL_BYTES;
            queue.reset();
        } else {
            return malformedInput(ERR_CTRLBYTE);
        }
        return CoderResult.UNDERFLOW;
    }

    /**
     * Parses the bytes after 0x1B 0x23. A terminator of 0x30 is accepted
     * only while {@code versionSequenceAllowed} is still set; 0x31 is always
     * rejected.
     */
    private CoderResult versionSequence(short newByte) {
        if (state == VERSION_SEQUENCE_V) {
            if (newByte >= 0x20 && newByte <= 0x2F) {
                state = VERSION_SEQUENCE_TERM;
                queue.write(newByte);
            } else {
                return escapeSequenceOther(newByte);
            }
        } else /* if (state == VERSION_SEQUENCE_TERM) */ {
            switch (newByte) {
            case 0x30:
                if (!versionSequenceAllowed) {
                    return malformedInput(ERR_VERSTART);
                }

                // OK to ignore extensions
                versionSequenceAllowed = false;
                state = NORMAL_BYTES;
                queue.reset();
                break;
            case 0x31:
                return malformedInput((versionSequenceAllowed)
                               ? ERR_VERMANDATORY : ERR_VERSTART);
            default:
                return escapeSequenceOther(newByte);
            }
        }
        return CoderResult.UNDERFLOW;
    }

    /**
     * Handles the byte that follows 0x1B 0x24.
     */
    private CoderResult charset94N(short newByte) {
        switch (newByte) {
        case 0x28:
            state = CHARSET_NLIIF;
            break;
        case 0x29:
            state = CHARSET_NRIIF;
            break;
        default:
            // escapeSequenceOther will write byte if appropriate
            return escapeSequenceOther(newByte);
        }

        queue.write(newByte);
        return CoderResult.UNDERFLOW;
    }

    /**
     * Common tail of the four charset-designating escape sequences
     * (NL, NR, L and R state pairs).
     *
     * @param newByte the current byte
     * @param cb destination, needed when the final byte switches decoders
     * @param firstState state in which no intermediate byte has been seen
     *        yet; there only 0x21-0x23 count as intermediates
     * @param laterState state entered after an intermediate byte; there
     *        0x21-0x2F count as intermediates
     */
    private CoderResult charsetDesignation(short newByte, CharBuffer cb,
                                           int firstState, int laterState) {
        int maxIntermediate = (state == firstState) ? 0x23 : 0x2F;
        if (newByte >= 0x21 && newByte <= maxIntermediate) {
            // {I}
            state = laterState;
            queue.write(newByte);
            return CoderResult.UNDERFLOW;
        }
        if (newByte >= 0x40 && newByte <= 0x7E) {
            // F
            return switchDecoder(newByte, cb);
        }
        return escapeSequenceOther(newByte);
    }

    /**
     * Parses the bytes after 0x1B 0x25: the 0x2F byte, the final byte, the
     * two length bytes and the encoding name up to its 0x02 terminator.
     */
    private CoderResult charsetNonStandard(short newByte, CharBuffer cb) {
        switch (state) {
        case CHARSET_NONSTANDARD_FOML:
            if (newByte != 0x2F) {
                return escapeSequenceOther(newByte);
            }
            state = CHARSET_NONSTANDARD_OML;
            queue.write(newByte);
            break;
        case CHARSET_NONSTANDARD_OML:
            if (newByte >= 0x30 && newByte <= 0x34) {
                state = CHARSET_NONSTANDARD_ML;
            } else if (newByte >= 0x35 && newByte <= 0x3F) {
                state = EXTENSION_ML;
            } else {
                return escapeSequenceOther(newByte);
            }
            queue.write(newByte);
            break;
        case CHARSET_NONSTANDARD_ML:
            // High 7 bits of the segment length.
            ext_count = (newByte & 0x7F) * 0x80;
            state = CHARSET_NONSTANDARD_L;
            break;
        case CHARSET_NONSTANDARD_L:
            // Low 7 bits of the segment length.
            ext_count = ext_count + (newByte & 0x7F);
            if (ext_count > 0) {
                state = CHARSET_NONSTANDARD;
            } else {
                // Empty segment: drop the queued escape bytes so they do
                // not prefix the next escape sequence.
                endSegment();
                encodingQueue.reset();
            }
            break;
        case CHARSET_NONSTANDARD:
            if (newByte == 0x3F || newByte == 0x2A) {
                queue.reset(); // In this case, only current byte is bad.
                return malformedInput(ERR_ENCODINGBYTE);
            }
            ext_offset++;
            if (ext_offset >= ext_count) {
                // Segment ended without any data after the name.
                endSegment();
                encodingQueue.reset();
            } else if (newByte == 0x02) {
                // encoding name terminator
                return switchDecoder((short)0, cb);
            } else {
                encodingQueue.write(newByte);
            }
            break;
        default:
            error(ERR_ILLSTATE);
        }
        return CoderResult.UNDERFLOW;
    }

    /**
     * Parses an extension segment: two length bytes followed by that many
     * bytes, which are skipped.
     */
    private CoderResult extension(short newByte) {
        switch (state) {
        case EXTENSION_ML:
            ext_count = (newByte & 0x7F) * 0x80;
            state = EXTENSION_L;
            break;
        case EXTENSION_L:
            ext_count = ext_count + (newByte & 0x7F);
            if (ext_count > 0) {
                state = EXTENSION;
            } else {
                // Empty segment: drop the queued escape bytes as well.
                endSegment();
            }
            break;
        case EXTENSION:
            // Consume 'count' bytes. Don't bother putting them on the queue.
            // There may be too many and we can't do anything with them anyway.
            ext_offset++;
            if (ext_offset >= ext_count) {
                endSegment();
            }
            break;
        default:
            error(ERR_ILLSTATE);
        }
        return CoderResult.UNDERFLOW;
    }

    /**
     * Leaves a length-prefixed segment: clears the counters and the queued
     * escape bytes and returns to NORMAL_BYTES.
     */
    private void endSegment() {
        ext_offset = ext_count = 0;
        state = NORMAL_BYTES;
        queue.reset();
    }

    /**
     * Looks up the decoder named by the completed escape sequence and
     * installs it as the GL, GR or non-standard decoder, depending on the
     * current state.
     *
     * Preconditions:
     *   1. 'queue' contains ControlSequence.escSequence
     *   2. 'encodingQueue' contains ControlSequence.encoding
     *
     * @param lastByte final byte to append to the queued sequence, or 0 to
     *        append nothing
     * @param cb destination for characters produced by flushing the
     *        decoder being replaced
     * @return the flush result, or an unmappable result when no decoder is
     *         found and the unmappable-character action is REPORT
     */
    private CoderResult switchDecoder(short lastByte, CharBuffer cb) {
        CoderResult cr = CoderResult.UNDERFLOW;
        CharsetDecoder decoder = null;
        boolean high = false;
        byte[] escSequence;
        byte[] encoding = null;

        if (lastByte != 0) {
            queue.write(lastByte);
        }

        escSequence = queue.toByteArray();
        queue.reset();

        if (state == CHARSET_NONSTANDARD) {
            encoding = encodingQueue.toByteArray();
            encodingQueue.reset();
            decoder = CompoundTextSupport.
                getNonStandardDecoder(escSequence, encoding);
        } else {
            decoder = CompoundTextSupport.getStandardDecoder(escSequence);
            high = CompoundTextSupport.getHighBit(escSequence);
        }
        if (decoder != null) {
            initDecoder(decoder);
        } else if (unmappableCharacterAction() == CodingErrorAction.REPORT) {
            // CoderResult rejects non-positive lengths, so an empty
            // encoding name must not be used as the length.
            int badInputLength = 1;
            if (encoding != null && encoding.length > 0) {
                badInputLength = encoding.length;
            } else if (escSequence.length > 0) {
                badInputLength = escSequence.length;
            }
            return CoderResult.unmappableForLength(badInputLength);
        }

        if (state == CHARSET_NLIIF || state == CHARSET_NLIF ||
            state == CHARSET_LIIF || state == CHARSET_LIF)
        {
            if (lastDecoder == glDecoder) {
                cr = flushDecoder(glDecoder, cb);
            }
            glDecoder = lastDecoder = decoder;
            glHigh = high;
            state = NORMAL_BYTES;
        } else if (state == CHARSET_NRIIF || state == CHARSET_NRIF ||
                   state == CHARSET_RIIF || state == CHARSET_RIF) {
            if (lastDecoder == grDecoder) {
                cr = flushDecoder(grDecoder, cb);
            }
            grDecoder = lastDecoder = decoder;
            grHigh = high;
            state = NORMAL_BYTES;
        } else if (state == CHARSET_NONSTANDARD) {
            if (lastDecoder != null) {
                cr = flushDecoder(lastDecoder, cb);
                lastDecoder = null;
            }
            nonStandardDecoder = decoder;
            state = NONSTANDARD_BYTES;
        } else {
            error(ERR_ILLSTATE);
        }
        return cr;
    }

    /**
     * Finishes the current run of {@code dec}: hands it any bytes still
     * pending in {@code inBB} as end-of-input, flushes it and resets it for
     * reuse. Pending bytes the decoder does not accept are discarded so they
     * cannot reach the next decoder.
     *
     * @return the result of {@code dec.flush(cb)}
     */
    private CoderResult flushDecoder(CharsetDecoder dec, CharBuffer cb) {
        inBB.flip();
        dec.decode(inBB, cb, true);
        inBB.clear();
        CoderResult cr = dec.flush(cb);
        dec.reset();  //reuse
        return cr;
    }

    /**
     * Builds a malformed-input result covering the queued sequence bytes
     * plus the current byte, and empties the queue.
     */
    private CoderResult malformedInput(String msg) {
        int badInputLength = queue.size() + 1 /* current byte */ ;
        queue.reset();
        //TBD: nowhere to put the msg in CoderResult
        return CoderResult.malformedForLength(badInputLength);
    }

    private void error(String msg) {
        // For now, throw InternalError. Convert to 'assert' keyword later.
        throw new InternalError(msg);
    }

    /** Same as {@link #error(String)} but records the underlying cause. */
    private void error(String msg, Throwable cause) {
        InternalError err = new InternalError(msg);
        err.initCause(cause);
        throw err;
    }

    /**
     * Flushes the last used delegate decoder and resets this decoder.
     *
     * @return a malformed result if the input ended inside an escape
     *         sequence, control sequence or segment; otherwise the result
     *         of flushing the delegate
     */
    @Override
    protected CoderResult implFlush(CharBuffer out) {
        CoderResult cr = CoderResult.UNDERFLOW;
        if (lastDecoder != null) {
            cr = flushDecoder(lastDecoder, out);
        }
        if (state != NORMAL_BYTES) {
            //TBD message ERR_FLUSH;
            // CoderResult requires a positive length; use the number of
            // queued sequence bytes, or 1 when none are queued. Must be
            // computed before reset() empties the queue.
            cr = CoderResult.malformedForLength(Math.max(1, queue.size()));
        }
        reset();
        return cr;
    }

    /**
     * Resets the decoder.
     * Call this method to reset the decoder to its initial state
     */
    @Override
    protected void implReset() {
        state = NORMAL_BYTES;
        ext_count = ext_offset = 0;
        versionSequenceAllowed = true;
        currentByteConsumed = true;
        queue.reset();
        encodingQueue.reset();
        // Drop bytes left over from an unfinished character.
        inBB.clear();
        nonStandardDecoder = lastDecoder = null;
        glHigh = false;
        grHigh = true;
        designateInitialDecoders();
    }

    /** Forwards the new malformed-input action to the current delegates. */
    @Override
    protected void implOnMalformedInput(CodingErrorAction newAction) {
        // The delegates are still null when the superclass constructor runs.
        if (glDecoder != null) {
            glDecoder.onMalformedInput(newAction);
        }
        if (grDecoder != null) {
            grDecoder.onMalformedInput(newAction);
        }
        if (nonStandardDecoder != null) {
            nonStandardDecoder.onMalformedInput(newAction);
        }
    }

    /** Forwards the new unmappable-character action to the current delegates. */
    @Override
    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
        if (glDecoder != null) {
            glDecoder.onUnmappableCharacter(newAction);
        }
        if (grDecoder != null) {
            grDecoder.onUnmappableCharacter(newAction);
        }
        if (nonStandardDecoder != null) {
            nonStandardDecoder.onUnmappableCharacter(newAction);
        }
    }

    /** Forwards the new replacement string to the current delegates. */
    @Override
    protected void implReplaceWith(String newReplacement) {
        if (glDecoder != null) {
            glDecoder.replaceWith(newReplacement);
        }
        if (grDecoder != null) {
            grDecoder.replaceWith(newReplacement);
        }
        if (nonStandardDecoder != null) {
            nonStandardDecoder.replaceWith(newReplacement);
        }
    }

    /**
     * Configures a delegate to replace unmappable characters with this
     * decoder's replacement string.
     */
    private void initDecoder(CharsetDecoder dec) {
        dec.onUnmappableCharacter(CodingErrorAction.REPLACE)
            .replaceWith(replacement());
    }
}