CodeSet.java example

Explorer
jacorb-master
/*
 *        JacORB - a free Java ORB
 *
 *   Copyright (C) 1997-2014 Gerald Brose / The JacORB Team.
 *
 *   This library is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU Library General Public
 *   License as published by the Free Software Foundation; either
 *   version 2 of the License, or (at your option) any later version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *   Library General Public License for more details.
 *
 *   You should have received a copy of the GNU Library General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.jacorb.orb;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import org.omg.CONV_FRAME.CodeSetComponent;
import org.omg.CONV_FRAME.CodeSetComponentInfo;
import org.omg.CORBA.CODESET_INCOMPATIBLE;
import org.omg.CORBA.MARSHAL;
/**
 * @author Gerald Brose
 */
public class CodeSet
{
    static final String CODESET_PREFIX = "0x00000000";

    /**
     * <code>ASCII</code> represents the base 7-bits of ISO8859_1
     */
    static final CodeSet ASCII_CODESET = new AsciiCodeSet();

    /**
     * <code>ISO8859_1</code> represents the default 8-bit codeset.
     * It is ISO 8859-1:1987; Latin Alphabet No. 1
     */
    static final CodeSet ISO8859_1_CODESET = new Iso8859_1CodeSet();

    /**
     * <code>ISO8859_15</code> represents Latin Alphabet No. 9
     */
    static final CodeSet ISO8859_15_CODESET = new Iso8859_15CodeSet();

    /**
     * <code>UTF8</code> represents UTF8 1-6 bytes for every character
     * X/Open UTF-8; UCS Transformation Format 8 (UTF-8)
     */
    static final CodeSet UTF8_CODESET = new Utf8CodeSet();

    /**
     * <code>UTF16</code> represents extended UCS2, 2 or 4 bytes for every char
     * ISO/IEC 10646-1:1993; UTF-16, UCS Transformation Format 16-bit form
     */
    static final CodeSet UTF16_CODESET = new Utf16CodeSet();

    /**
     * <code>UCS2</code> represents UCS2, 2bytes for every char
     * ISO/IEC 10646-1:1993; UTF-16, UCS Transformation Format 16-bit form
     */
    static final CodeSet UCS2_CODESET = new Ucs2CodeSet();

    static final CodeSet MAC_ROMAN_CODESET = new MacRomanCodeSet();

    /**
     * All of the encodings supported by Jacorb. These should be listed in order of preference.
     */
    static final CodeSet[] KNOWN_ENCODINGS = { ISO8859_1_CODESET, ISO8859_15_CODESET, UTF16_CODESET, UTF8_CODESET, UCS2_CODESET, MAC_ROMAN_CODESET, ASCII_CODESET };

    /**
     * The default JVM platform encoding.
     */
    static final String DEFAULT_PLATFORM_ENCODING;

    /**
     * A 'null object' code set instance, used when no matching codeset is found.
     */
    static final CodeSet NULL_CODE_SET = new CodeSet( -1, "NO SUCH CODESET" );



    static
    {
        // See http://java.sun.com/j2se/1.4.1/docs/guide/intl/encoding.doc.html for
        // a list of encodings and their canonical names.
        //
        // http://developer.java.sun.com/developer/bugParade/bugs/4772857.html
        //
        // This allows me to get the actual canonical name of the encoding as the
        // System property may differ depending upon locale and OS.
        OutputStreamWriter defaultStream = new OutputStreamWriter( new ByteArrayOutputStream() );
        DEFAULT_PLATFORM_ENCODING = defaultStream.getEncoding();
        try
        {
            defaultStream.close();
        }
        catch( IOException e )
        {
        }
    }


    /** The standard CORBA identifier associated with this code set; used during negotiation. */
    private int id;

    /** The canonical name of this code set. */
    private String name;

    /** Identify this codeset as a local alias of some shared codeset and thus not to be added to the IOR */
    boolean isAlias;

    /**
     * Convert the CORBA standard id to a String name.
     *
     * @param cs
     * @return
     */
    public static String csName(int cs)
    {
        for (int i = 0; i < KNOWN_ENCODINGS.length; i++)
        {
            if (cs == KNOWN_ENCODINGS[i].getId()) return KNOWN_ENCODINGS[i].getName();
        }
        return "Unknown TCS: 0x" + Integer.toHexString(cs);
    }


    /**
     * Returns the code set which matches the specified name, which should either be the canonical name of
     * a supported encoding or the hex representation of its ID.
     * @param name the string used to select a codeset.
     * @return the matching code set or NULL_CODE_SET if there are no matches.
     */
    public static CodeSet getCodeSet( String name ) {
        String ucName = name.toUpperCase();
        for (int i = 0; i < KNOWN_ENCODINGS.length; i++)
        {
            CodeSet codeset = KNOWN_ENCODINGS[i];
            if (codeset.getName().equals( ucName )) return codeset;
        }

        try
        {
            int id = Integer.parseInt( name, 16 );
            for (int i = 0; i < KNOWN_ENCODINGS.length; i++)
            {
                CodeSet codeset = KNOWN_ENCODINGS[i];
                if (id == codeset.getId()) return codeset;
            }
            return NULL_CODE_SET;
        }
        catch (NumberFormatException ex)
        {
            return NULL_CODE_SET;
        }
    }


    /**
     * Returns the code set which matches the specified ID.
     * @return the matching code set or NULL_CODE_SET if there are no matches.
     */
    public static CodeSet getCodeSet( int id ) {
        for (int i = 0; i < KNOWN_ENCODINGS.length; i++)
        {
            CodeSet codeset = KNOWN_ENCODINGS[i];
            if (id == codeset.id) return codeset;
        }
        return NULL_CODE_SET;
    }


    public static CodeSet getNegotiatedCodeSet( ORB orb, CodeSetComponentInfo serverCodeSetInfo, boolean wide )
    {
        return getMatchingCodeSet( getSelectedComponent( orb.getLocalCodeSetComponentInfo(), wide ),
                                   getSelectedComponent( serverCodeSetInfo, wide ),
                                   wide );
    }


    static CodeSetComponent createCodeSetComponent( boolean wide, CodeSet nativeCodeSet )
    {
        ArrayList<CodeSet> codeSets = new ArrayList<CodeSet>();
        codeSets.add( nativeCodeSet );
        for (int i = 0; i < KNOWN_ENCODINGS.length; i++)
        {
            if (KNOWN_ENCODINGS[i].supportsCharacterData( wide ) && !codeSets.contains( KNOWN_ENCODINGS[i] ))
            {
                if (!KNOWN_ENCODINGS[i].isAlias)
                    codeSets.add( KNOWN_ENCODINGS[i] );
            }
        }
        int nativeSet = codeSets.remove( 0 ).getId();
        int[] conversionSets = new int[codeSets.size()];
        for (int i = 0; i < conversionSets.length; i++)
        {
            conversionSets[i] = codeSets.get(i).getId();
        }
        return new CodeSetComponent( nativeSet, conversionSets );
    }


    public static CodeSet getMatchingCodeSet( CodeSetComponent local, CodeSetComponent remote, boolean wide )
    {
        CodeSet codeSet = getCodeSetIfMatched( local.native_code_set, remote );
        if (codeSet != null) return codeSet;

        for (int i = 0; i < local.conversion_code_sets.length; i++)
        {
            codeSet = getCodeSetIfMatched( local.conversion_code_sets[i], remote );
            if (codeSet != null) return codeSet;
        }

        return reportNegotiationFailure( local, remote, wide );
    }


    public static CodeSet getCodeSetIfMatched( int localCodeSetId, CodeSetComponent remote )
    {
        if (localCodeSetId == remote.native_code_set)
        {
            return getCodeSet( localCodeSetId );
        }
        else
        {
            for (int i = 0; i < remote.conversion_code_sets.length; i++)
            {
                if (localCodeSetId == remote.conversion_code_sets[i])
                {
                    return getCodeSet( localCodeSetId );
                }
            }
        }
        return null;
    }


    private static CodeSet reportNegotiationFailure( CodeSetComponent local, CodeSetComponent remote, boolean wide )
    {
        StringBuffer sb = new StringBuffer( "No matching ");
        if (wide) sb.append( "wide " );
        sb.append( "code set found. Client knows {" );
        appendCodeSetList( sb, local );
        sb.append( "}. Server offered {" );
        appendCodeSetList( sb, remote );
        sb.append( '}' );
        throw new CODESET_INCOMPATIBLE( sb.toString() );
    }


    private static void appendCodeSetList( StringBuffer sb, CodeSetComponent remote )
    {
        int code_set = remote.native_code_set;
        sb.append( toCodeSetString( code_set ) );
        for (int i = 0; i < remote.conversion_code_sets.length; i++) {
            sb.append( ',' ).append( toCodeSetString( remote.conversion_code_sets[i] ) );
        }
    }


    private static String toCodeSetString( int code_set )
    {
        String rawString = Integer.toHexString( code_set );
        return CODESET_PREFIX.substring( 0, CODESET_PREFIX.length() - rawString.length() ) + rawString;
    }


    private static CodeSetComponent getSelectedComponent( CodeSetComponentInfo info, boolean wide )
    {
        return wide ? info.ForWcharData : info.ForCharData;
    }


    /**
     * This interface represents a buffer from which character data can be read.
     */
    public static interface InputBuffer
    {

        /**  Reads the next byte from an in-memory buffer. */
        public byte readByte();

        /** Returns the current position in the buffer. */
        int get_pos();


        /**
         * Looks ahead in the buffer to see if a byte-order marker is present. If so, reads it from the buffer
         * and returns the result.
         * @return true if a marker indicating little-endian was read.
         */
        boolean readBOM();
    }


    /**
     * Represents a buffer to which character data may be written.
     */
    public static interface OutputBuffer
    {
        /**
         * Writes the specified byte to the buffer.
         * @param b the byte to write
         */
        void write_byte( byte b );

        /**
         * Forces short (2-byte) alignment and writes the specified value to the buffer.
         * @param value the value to write.
         */
        void write_short( short value );

        /**
         * Write an array of bytes to the buffer
         * @param b
         * @param offset
         * @param length
         */
        void write_octet_array(byte []b, int offset, int length);
    }


    public CodeSet( int id, String name )
    {
        this.id = id;
        this.name = name;
        this.isAlias = false;
    }


    /**
     * Returns true if this codeset supports the specified character type.
     * @param wide
     */
    public boolean supportsCharacterData( boolean wide )
    {
        return false;
    }


    /**
     * Returns true if this codeset supports multie-byte characters
     */
    public boolean supportsWideCharacterData()
    {
        return false;
    }


    /**
     * Returns the CORBA-standard id for this code set.
     */
    public int getId()
    {
        return id;
    }


    /**
     * Returns the canonical name of this code set.
     */
    public String getName()
    {
        return name;
    }

    @Override
    public String toString ()
    {
        return getName();
    }


    /**
     * Returns true if this code set requires byte-order-markers to be written to the beginning of a stream of text.
     * @param configuredForBom true if the orb has been configured to write byte-order-markers.
     */
    public boolean write_bom( boolean configuredForBom )
    {
        return false;
    }


    /**
     * Reads a wide character from the specified buffer.
     * @param buffer the buffer containing the data.
     * @param giop_minor the low-order byte of the giop version (1.x is assumed)
     * @param littleEndian true if the character is to be read low end first
     * @return the wide character.
     */
    public char read_wchar( InputBuffer buffer, int giop_minor, boolean littleEndian )
    {
        throw new MARSHAL( "Bad wide char codeSet: " + getName() );
    }


    /**
     * Reads a wide string from the buffer. The length indicator is presumed already to have been read.
     * @param buffer          the buffer from which to read the string
     * @param lengthIndicator the length indicator already read
     * @param giop_minor      the low-order byte of the giop version (1.x is assumed)
     * @param littleEndian    true if the characters are to be read low end first
     * @return a string possibly containing wide characters.
     */
    public String read_wstring( InputBuffer buffer, int lengthIndicator, int giop_minor, boolean littleEndian )
    {
        throw new MARSHAL( "Bad wide char codeSet: " + getName() );
    }


    /**
     * Writes a character to the buffer with the appropriate encoding.
     * @param buffer       the buffer to which the character is written
     * @param c            the character to write
     * @param write_bom    true if a byte-order-marker (indicating big-endian) should be written
     * @param write_length true if the length of the character should be written
     * @param giop_minor   the low-order byte of the giop version (1.x is assumed)
     */
    public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length, int giop_minor )
    {
        throw new CODESET_INCOMPATIBLE("Bad codeset: " + getName() );
    }

    /**
     * Writes a sting to the buffer with the appropriate encoding.
     * @param buffer       the buffer to which the string is written
     * @param s            the string to write
     * @param write_bom    true if a byte-order-marker (indicating big-endian) should be written
     * @param write_length true if the length of the character should be written
     * @param giop_minor   the low-order byte of the giop version (1.x is assumed)
     */
    public void write_string(OutputBuffer buffer, String s, boolean write_bom, boolean write_length, int giop_minor)
    {
        for (int i = 0; i < s.length(); i++)
        {
            this.write_char(buffer, s.charAt(i), write_bom, write_length, giop_minor);
        }
    }

    /**
     * Returns the length of the string just written to the buffer.
     * @param string     the string written
     * @param startPos   the starting position at which the string was written
     * @param currentPos the current buffer position
     */
    public int get_wstring_size( String string, int startPos, int currentPos )
    {
        return 0;
    }


    /**
     * Reads a wide string from the buffer according to GIOP 1.2. The length indicator is presumed already to have been read.
     * @param buffer          the buffer from which to read the string
     * @param size            the length indicator already read
     * @param giop_minor      the low-order byte of the giop version (must be >= 2)
     * @return a string possibly containing wide characters.
     */
    final String readGiop12WString( InputBuffer buffer, int size, int giop_minor )
    {
        char buf[] = new char[ size ];
        int endPos = buffer.get_pos() + size;

        boolean wchar_litte_endian = buffer.readBOM();

        int i = 0;
        while( buffer.get_pos() < endPos )
        {
            buf[ i++ ] = read_wchar( buffer, giop_minor, wchar_litte_endian );
        }

        return new String( buf, 0, i );
    }


    static private class Iso8859_1CodeSet extends CodeSet {

        private Iso8859_1CodeSet()
        {
            super( 0x00010001, "ISO8859_1" );
        }


        /**
         * Only used for derived codesets
         */
        Iso8859_1CodeSet(int i, String name)
        {
            super( i, name);
        }


        /**
         * Returns true if 'wide' is not specified.
         */
        @Override
        public boolean supportsCharacterData( boolean wide )
        {
            return !wide;
        }


        @Override
        public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
        {
            buffer.write_byte( (byte) c );
        }
    }

    static private class AsciiCodeSet extends Iso8859_1CodeSet {

        private AsciiCodeSet()
        {
            super( 0x00010001, "ASCII" );
            this.isAlias = true;
        }


        /**
         * Only used for derived codesets
         */
        AsciiCodeSet(int i, String name)
        {
            super( i, name);
            this.isAlias = true;
        }

    }

    static private class MacRomanCodeSet extends Iso8859_1CodeSet {

        private MacRomanCodeSet()
        {
            super( 0x00010001, "MacRoman" );
            this.isAlias = true;
        }


        /**
         * Only used for derived codesets
         */
        MacRomanCodeSet(int i, String name)
        {
            super( i, name);
            this.isAlias = true;
        }
    }

    static private class Iso8859_15CodeSet extends Iso8859_1CodeSet {

        private Iso8859_15CodeSet()
        {
            super( 0x0001000F, "ISO8859_15" );
        }

        @Override
        public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
        {
            switch (c)
            {
                case '\u20AC':
                {
                    buffer.write_byte((byte) 0xA4);
                    break;
                }
                case '\u0160':
                {
                    buffer.write_byte((byte) 0xA6);
                    break;
                }
                case '\u0161':
                {
                    buffer.write_byte((byte) 0xA8);
                    break;
                }
                case '\u017D':
                {
                    buffer.write_byte((byte) 0xB4);
                    break;
                }
                case '\u017E':
                {
                    buffer.write_byte((byte) 0xB8);
                    break;
                }
                case '\u0152':
                {
                    buffer.write_byte((byte) 0xBC);
                    break;
                }
                case '\u0153':
                {
                    buffer.write_byte((byte) 0xBD);
                    break;
                }
                case '\u0178':
                {
                    buffer.write_byte((byte) 0xBE);
                    break;
                }
                default:
                {
                    super.write_char (buffer, c, write_bom, write_length_indicator, giop_minor);
                }
            }
        }
    }


    static private class Utf8CodeSet extends CodeSet {

        private Utf8CodeSet( )
        {
            super( 0x05010001, "UTF8" );
        }


        /**
         * Returns true for both wide and non-wide characters.
         */
        @Override
        public boolean supportsCharacterData( boolean wide )
        {
            return true;
        }


        @Override
        public char read_wchar( InputBuffer buffer, int giop_minor, boolean littleEndian )
        {
            if (giop_minor < 2)
            {
                throw new MARSHAL( "GIOP 1." + giop_minor +
                                   " only allows 2 Byte encodings for wchar, but the selected TCSW is UTF-8" );
            }

            short value = (short) (0xff & buffer.readByte());

            if ((value & 0x80) == 0)
            {
                return (char) value;
            }
            else if ((value & 0xe0) == 0xc0)
            {
                return (char) (((value & 0x1F) << 6) | (buffer.readByte() & 0x3F));
            }
            else
            {
                short b2 = (short) (0xff & buffer.readByte());
                return (char) (((value & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (buffer.readByte() & 0x3F));
            }
        }


        @Override
        public String read_wstring( InputBuffer source, int lengthIndicator, int giop_minor, boolean little_endian )
        {
            if (giop_minor < 2) throw new MARSHAL( "Bad wide char codeSet: " + getName() );
            return readGiop12WString( source, lengthIndicator, giop_minor );
        }


        @Override
        public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
        {
            if( c <= 0x007F )
            {
                if( giop_minor == 2 && write_length_indicator )
                {
                    //the chars length in bytes
                    buffer.write_byte( (byte) 1);
                }

                buffer.write_byte( (byte) c );
            }
            else if( c > 0x07FF )
            {
                if( giop_minor == 2 && write_length_indicator )
                {
                    //the chars length in bytes
                    buffer.write_byte( (byte) 3 );
                }

                buffer.write_byte( (byte)(0xE0 | ((c >> 12) & 0x0F)) );
                buffer.write_byte( (byte)(0x80 | ((c >>  6) & 0x3F)) );
                buffer.write_byte( (byte)(0x80 | ((c >>  0) & 0x3F)) );
            }
            else
            {
                if( giop_minor == 2 && write_length_indicator )
                {
                    buffer.write_byte( (byte) 2 );   //the chars length in bytes
                }

                buffer.write_byte( (byte)(0xC0 | ((c >>  6) & 0x1F)) );
                buffer.write_byte( (byte)(0x80 | ((c >>  0) & 0x3F)) );
            }
        }

        @Override
        public void write_string( OutputBuffer buffer, String s, boolean write_bom, boolean write_length, int giop_minor )
        {
            try
            {
                byte[] bytes = s.getBytes(this.getName());
                buffer.write_octet_array(bytes, 0, bytes.length);
            }
            catch (UnsupportedEncodingException e)
            {
                throw new CODESET_INCOMPATIBLE("Bad codeset: " + getName());
            }
        }

        @Override
        public int get_wstring_size( String s, int startPos, int currentPos )
        {
            return currentPos - startPos - 4;
        }
    }


    static abstract private class TwoByteCodeSet extends CodeSet
    {

        TwoByteCodeSet( int id, String name )
        {
            super( id, name );
        }


        /**
         * Returns true if the character type specified is 'wide'.
         */
        @Override
        public boolean supportsCharacterData( boolean wide )
        {
            return wide;
        }


        @Override
        public char read_wchar( InputBuffer buffer, int giop_minor, boolean littleEndian )
        {
            if (littleEndian)
            {
                return (char) ((buffer.readByte() & 0xFF) | (buffer.readByte() << 8));
            }
            else
            {
                return (char) ((buffer.readByte() << 8) | (buffer.readByte() & 0xFF));
            }
        }


        @Override
        public String read_wstring( InputBuffer source, int lengthIndicator, int giop_minor, boolean little_endian )
        {
            if( giop_minor == 2 )
            {
                return readGiop12WString( source, lengthIndicator, giop_minor );
            }
            else //GIOP 1.1 / 1.0 : length indicates number of 2-byte characters
            {
                char buf[] = new char[lengthIndicator];
                int endPos = source.get_pos() + 2* lengthIndicator;

                int i = 0;
                while( source.get_pos() < endPos )
                {
                    buf[ i++ ] = read_wchar( source, giop_minor, little_endian );
                }

                if( (i != 0) && (buf[ i - 1 ] == 0) ) //don't return terminating NUL
                {
                    return new String( buf, 0, i - 1 );
                }
                else   //doesn't have a terminating NUL. This is actually not allowed
                {
                    return new String( buf, 0, i );
                }
            }
        }


        @Override
        public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
        {
            if (giop_minor < 2)
            {
                buffer.write_short( (short) c );   //UTF-16 char is treated as an ushort (write aligned)
            }
            else
            {
                if (write_length_indicator)  //the chars length in bytes
                {
                    buffer.write_byte( (byte) 2 );
                }

                if (write_bom) //big endian encoding
                {

                    buffer.write_byte( (byte) 0xFE );
                    buffer.write_byte( (byte) 0xFF );
                }

                //write unaligned
                buffer.write_byte( (byte)((c >> 8) & 0xFF) );
                buffer.write_byte( (byte) (c       & 0xFF) );
            }
        }

        @Override
        public int get_wstring_size( String s, int startPos, int currentPos )
        {
            return s.length() + 1;   // size in chars (+ NUL char)
        }
    }


    static private class Utf16CodeSet extends TwoByteCodeSet
    {

        private Utf16CodeSet()
        {
            super( 0x00010109, "UTF16" );
        }

        /** Returns the configured value to use BOMs only when specifically configured to do so. */
        @Override
        public boolean write_bom( boolean configuredForBom )
        {
            return false;
        }
    }


    /*
     * According to:
     * http://www.omg.org/issues/issue4008.txt
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4506930
     * UCS-2 should not write a BOM.
     */
    static private class Ucs2CodeSet extends TwoByteCodeSet
    {
        private Ucs2CodeSet()
        {
            super( 0x00010100, "UCS2" );
        }
    }
}