/*
* JacORB - a free Java ORB
*
* Copyright (C) 1997-2014 Gerald Brose / The JacORB Team.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.jacorb.orb;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import org.omg.CONV_FRAME.CodeSetComponent;
import org.omg.CONV_FRAME.CodeSetComponentInfo;
import org.omg.CORBA.CODESET_INCOMPATIBLE;
import org.omg.CORBA.MARSHAL;
/**
* @author Gerald Brose
*/
public class CodeSet
{
/**
 * Template used when printing a code set id: ids are rendered as "0x" followed by
 * eight hex digits, and this string supplies the "0x" and the leading zero padding.
 */
static final String CODESET_PREFIX = "0x00000000";
/**
 * <code>ASCII</code> represents the base 7-bits of ISO8859_1.
 * It is registered as a local alias: it carries the same id as ISO8859_1.
 */
static final CodeSet ASCII_CODESET = new AsciiCodeSet();
/**
 * <code>ISO8859_1</code> represents the default 8-bit codeset.
 * It is ISO 8859-1:1987; Latin Alphabet No. 1
 */
static final CodeSet ISO8859_1_CODESET = new Iso8859_1CodeSet();
/**
 * <code>ISO8859_15</code> represents Latin Alphabet No. 9
 */
static final CodeSet ISO8859_15_CODESET = new Iso8859_15CodeSet();
/**
 * <code>UTF8</code> represents UTF8 1-6 bytes for every character
 * X/Open UTF-8; UCS Transformation Format 8 (UTF-8)
 */
static final CodeSet UTF8_CODESET = new Utf8CodeSet();
/**
 * <code>UTF16</code> represents extended UCS2, 2 or 4 bytes for every char
 * ISO/IEC 10646-1:1993; UTF-16, UCS Transformation Format 16-bit form
 */
static final CodeSet UTF16_CODESET = new Utf16CodeSet();
/**
 * <code>UCS2</code> represents UCS2, 2 bytes for every char
 * ISO/IEC 10646-1:1993; UCS-2
 */
static final CodeSet UCS2_CODESET = new Ucs2CodeSet();
/**
 * <code>MacRoman</code> is handled with the ISO8859_1 single-byte logic and is
 * registered as a local alias: it carries the same id as ISO8859_1.
 */
static final CodeSet MAC_ROMAN_CODESET = new MacRomanCodeSet();
/**
 * All of the encodings supported by Jacorb. These should be listed in order of preference.
 * Lookups by id return the first entry whose id matches, so ISO8859_1 must stay ahead of
 * the aliases that share its id.
 */
static final CodeSet[] KNOWN_ENCODINGS = { ISO8859_1_CODESET, ISO8859_15_CODESET, UTF16_CODESET, UTF8_CODESET, UCS2_CODESET, MAC_ROMAN_CODESET, ASCII_CODESET };
/**
 * The default JVM platform encoding, as reported by an <code>OutputStreamWriter</code>
 * created without an explicit charset. Assigned in the static initializer.
 */
static final String DEFAULT_PLATFORM_ENCODING;
/**
 * A 'null object' code set instance, used when no matching codeset is found.
 */
static final CodeSet NULL_CODE_SET = new CodeSet( -1, "NO SUCH CODESET" );
static
{
// See http://java.sun.com/j2se/1.4.1/docs/guide/intl/encoding.doc.html for
// a list of encodings and their canonical names.
//
// http://developer.java.sun.com/developer/bugParade/bugs/4772857.html
//
// This allows me to get the actual canonical name of the encoding as the
// System property may differ depending upon locale and OS.
OutputStreamWriter defaultStream = new OutputStreamWriter( new ByteArrayOutputStream() );
DEFAULT_PLATFORM_ENCODING = defaultStream.getEncoding();
try
{
defaultStream.close();
}
catch( IOException e )
{
}
}
/**
 * The standard CORBA identifier associated with this code set; used during negotiation.
 * Assigned once by the constructor; final so the shared static instances are immutable.
 */
private final int id;
/**
 * The canonical name of this code set. Assigned once by the constructor.
 */
private final String name;
/**
 * Identify this codeset as a local alias of some shared codeset and thus not to be added to the IOR.
 * Not final because the alias subclasses set it after the base constructor has run.
 */
boolean isAlias;
/**
 * Translates a CORBA standard code set id into the name of a known encoding.
 *
 * @param cs the code set id to look up
 * @return the name of the first known encoding carrying that id, or the text
 *         "Unknown TCS: 0x" followed by the id in hex when no encoding matches
 */
public static String csName(int cs)
{
    for (CodeSet candidate : KNOWN_ENCODINGS)
    {
        if (candidate.getId() == cs)
        {
            return candidate.getName();
        }
    }
    return "Unknown TCS: 0x" + Integer.toHexString(cs);
}
/**
 * Returns the code set which matches the specified name, which should either be the canonical name of
 * a supported encoding or the hex representation of its ID.
 * <p>
 * Names are compared without regard to case and independently of the default locale, so
 * mixed-case canonical names such as "MacRoman" can be matched. The hex form may carry an
 * optional "0x" prefix.
 *
 * @param name the string used to select a codeset; may be null.
 * @return the matching code set or NULL_CODE_SET if there are no matches (including a null name).
 */
public static CodeSet getCodeSet( String name )
{
    if (name == null)
    {
        return NULL_CODE_SET;
    }

    // First interpretation: a canonical encoding name.
    for (CodeSet codeset : KNOWN_ENCODINGS)
    {
        if (codeset.getName().equalsIgnoreCase( name ))
        {
            return codeset;
        }
    }

    // Second interpretation: the id written in hex, optionally prefixed with "0x".
    String hexDigits = name;
    if (hexDigits.startsWith( "0x" ) || hexDigits.startsWith( "0X" ))
    {
        hexDigits = hexDigits.substring( 2 );
    }
    try
    {
        // The id lookup already yields NULL_CODE_SET for an unknown id.
        return getCodeSet( Integer.parseInt( hexDigits, 16 ) );
    }
    catch (NumberFormatException ex)
    {
        // Neither a known name nor a parsable hex id.
        return NULL_CODE_SET;
    }
}
/**
 * Looks up a known code set by its CORBA id.
 *
 * @param id the code set id to search for.
 * @return the first known encoding carrying that id, or NULL_CODE_SET if none does.
 */
public static CodeSet getCodeSet( int id )
{
    for (CodeSet candidate : KNOWN_ENCODINGS)
    {
        if (candidate.id == id)
        {
            return candidate;
        }
    }
    return NULL_CODE_SET;
}
/**
 * Negotiates a code set between the local ORB and a server.
 *
 * @param orb the local ORB, queried for its own code set component info.
 * @param serverCodeSetInfo the code set component info offered by the server.
 * @param wide true to negotiate for wide character data, false for ordinary character data.
 * @return the code set selected by {@link #getMatchingCodeSet}.
 */
public static CodeSet getNegotiatedCodeSet( ORB orb, CodeSetComponentInfo serverCodeSetInfo, boolean wide )
{
    CodeSetComponent localComponent = getSelectedComponent( orb.getLocalCodeSetComponentInfo(), wide );
    CodeSetComponent serverComponent = getSelectedComponent( serverCodeSetInfo, wide );
    return getMatchingCodeSet( localComponent, serverComponent, wide );
}
/**
 * Builds the code set component for one character kind: the native code set plus, as
 * conversion code sets, every other known non-alias encoding that supports that kind.
 *
 * @param wide true to build the component for wide character data.
 * @param nativeCodeSet the code set to advertise as native.
 * @return a component holding the native id and the ids of the conversion code sets.
 */
static CodeSetComponent createCodeSetComponent( boolean wide, CodeSet nativeCodeSet )
{
    // Slot 0 temporarily holds the native code set so that the contains() check
    // below also keeps it out of the conversion list.
    ArrayList<CodeSet> candidates = new ArrayList<CodeSet>();
    candidates.add( nativeCodeSet );

    for (CodeSet known : KNOWN_ENCODINGS)
    {
        if (!known.supportsCharacterData( wide ))
        {
            continue;
        }
        if (candidates.contains( known ))
        {
            continue;
        }
        if (known.isAlias)
        {
            continue;
        }
        candidates.add( known );
    }

    int nativeSet = candidates.remove( 0 ).getId();

    int[] conversionSets = new int[candidates.size()];
    int slot = 0;
    for (CodeSet conversion : candidates)
    {
        conversionSets[slot++] = conversion.getId();
    }
    return new CodeSetComponent( nativeSet, conversionSets );
}
/**
 * Picks the first local code set that the remote side also offers. The local native
 * code set is tried first, then the local conversion code sets in their listed order.
 *
 * @param local the local code set component.
 * @param remote the remote code set component.
 * @param wide true when negotiating for wide characters; only used in the failure message.
 * @return the matching code set.
 * @throws CODESET_INCOMPATIBLE if none of the local code sets is offered by the remote side.
 */
public static CodeSet getMatchingCodeSet( CodeSetComponent local, CodeSetComponent remote, boolean wide )
{
    CodeSet match = getCodeSetIfMatched( local.native_code_set, remote );
    if (match == null)
    {
        for (int conversionId : local.conversion_code_sets)
        {
            match = getCodeSetIfMatched( conversionId, remote );
            if (match != null)
            {
                break;
            }
        }
    }
    if (match != null)
    {
        return match;
    }
    return reportNegotiationFailure( local, remote, wide );
}
/**
 * Checks whether the remote component offers the given local code set id, either as its
 * native code set or as one of its conversion code sets.
 *
 * @param localCodeSetId the id of the local code set to look for.
 * @param remote the remote code set component.
 * @return the result of looking the id up by id when the remote side offers it, otherwise null.
 */
public static CodeSet getCodeSetIfMatched( int localCodeSetId, CodeSetComponent remote )
{
    boolean offered = (localCodeSetId == remote.native_code_set);

    // The conversion list is only consulted when the native code set did not match.
    for (int i = 0; !offered && i < remote.conversion_code_sets.length; i++)
    {
        offered = (localCodeSetId == remote.conversion_code_sets[i]);
    }

    if (!offered)
    {
        return null;
    }
    return getCodeSet( localCodeSetId );
}
/**
 * Reports a failed negotiation by throwing an exception that lists the code sets known
 * to each side. Never returns normally; the return type only lets callers write
 * <code>return reportNegotiationFailure(...)</code>.
 *
 * @param local the local (client) code set component.
 * @param remote the remote (server) code set component.
 * @param wide true if the negotiation was for wide characters.
 * @throws CODESET_INCOMPATIBLE always.
 */
private static CodeSet reportNegotiationFailure( CodeSetComponent local, CodeSetComponent remote, boolean wide )
{
    StringBuffer message = new StringBuffer( "No matching ");
    if (wide)
    {
        message.append( "wide " );
    }
    message.append( "code set found. Client knows {" );
    appendCodeSetList( message, local );
    message.append( "}. Server offered {" );
    appendCodeSetList( message, remote );
    message.append( '}' );

    String text = message.toString();
    throw new CODESET_INCOMPATIBLE( text );
}
/**
 * Appends the ids of a code set component to the buffer as a comma-separated list:
 * the native code set first, followed by each conversion code set.
 *
 * @param sb the buffer to append to.
 * @param remote the component whose ids are listed (callers pass local components as well).
 */
private static void appendCodeSetList( StringBuffer sb, CodeSetComponent remote )
{
    sb.append( toCodeSetString( remote.native_code_set ) );
    for (int conversionId : remote.conversion_code_sets)
    {
        sb.append( ',' );
        sb.append( toCodeSetString( conversionId ) );
    }
}
/**
 * Formats a code set id as "0x" followed by eight lower-case hex digits.
 *
 * @param code_set the code set id.
 * @return the zero-padded hex form of the id.
 */
private static String toCodeSetString( int code_set )
{
    String hexDigits = Integer.toHexString( code_set );

    // Keep only as much of the "0x00000000" template as the digits do not overwrite.
    int templateLength = CODESET_PREFIX.length() - hexDigits.length();

    StringBuilder formatted = new StringBuilder( CODESET_PREFIX.length() );
    formatted.append( CODESET_PREFIX.substring( 0, templateLength ) );
    formatted.append( hexDigits );
    return formatted.toString();
}
/**
 * Selects the part of a code set component info that applies to one character kind.
 *
 * @param info the component info holding both the char and the wchar component.
 * @param wide true to select the wide character component.
 * @return <code>info.ForWcharData</code> when wide, otherwise <code>info.ForCharData</code>.
 */
private static CodeSetComponent getSelectedComponent( CodeSetComponentInfo info, boolean wide )
{
    if (wide)
    {
        return info.ForWcharData;
    }
    return info.ForCharData;
}
/**
 * A source of encoded character data that the code sets decode from.
 */
public static interface InputBuffer
{
    /**
     * Reads the next byte from an in-memory buffer.
     * @return the byte read.
     */
    byte readByte();

    /**
     * Returns the current position in the buffer.
     * @return the current read position.
     */
    int get_pos();

    /**
     * Looks ahead in the buffer to see if a byte-order marker is present. If so, reads it
     * from the buffer and returns the result.
     * @return true if a marker indicating little-endian was read.
     */
    boolean readBOM();
}
/**
 * A sink for encoded character data that the code sets encode into.
 */
public static interface OutputBuffer
{
    /**
     * Writes a single byte to the buffer.
     * @param b the byte to write
     */
    void write_byte( byte b );

    /**
     * Forces short (2-byte) alignment and writes the specified value to the buffer.
     * @param value the value to write.
     */
    void write_short( short value );

    /**
     * Writes part of a byte array to the buffer.
     * @param b the array holding the bytes to write
     * @param offset the index in <code>b</code> of the first byte to write
     * @param length the number of bytes to write
     */
    void write_octet_array(byte []b, int offset, int length);
}
/**
 * Creates a code set that is not an alias.
 *
 * @param id the CORBA standard id of the code set.
 * @param name the canonical name of the code set.
 */
public CodeSet( int id, String name )
{
    // Alias subclasses switch this flag on after this constructor has returned.
    this.isAlias = false;
    this.name = name;
    this.id = id;
}
/**
 * Tells whether this codeset can carry the given kind of character data.
 * This base implementation supports neither kind; concrete code sets override it.
 *
 * @param wide true to ask about wide character data, false for ordinary character data.
 * @return always false here.
 */
public boolean supportsCharacterData( boolean wide )
{
    return false;
}
/**
 * Tells whether this codeset supports multi-byte characters.
 *
 * @return always false in this base implementation.
 */
public boolean supportsWideCharacterData()
{
    return false;
}
/**
 * Gives the CORBA-standard id of this code set.
 *
 * @return the id passed to the constructor.
 */
public int getId()
{
    return this.id;
}
/**
 * Gives the canonical name of this code set.
 *
 * @return the name passed to the constructor.
 */
public String getName()
{
    return this.name;
}
/**
 * Uses the code set name as the string form.
 *
 * @return the same value as {@link #getName()}.
 */
@Override
public String toString ()
{
    final String display = getName();
    return display;
}
/**
 * Tells whether byte-order-markers must be written at the beginning of a stream of text
 * encoded with this code set. This base implementation never asks for one.
 *
 * @param configuredForBom true if the orb has been configured to write byte-order-markers.
 * @return always false here.
 */
public boolean write_bom( boolean configuredForBom )
{
    return false;
}
/**
 * Reads a wide character from the specified buffer. This base implementation does not
 * support wide characters and always fails.
 *
 * @param buffer the buffer containing the data.
 * @param giop_minor the low-order byte of the giop version (1.x is assumed)
 * @param littleEndian true if the character is to be read low end first
 * @return the wide character.
 * @throws MARSHAL always, naming this code set.
 */
public char read_wchar( InputBuffer buffer, int giop_minor, boolean littleEndian )
{
    final String reason = "Bad wide char codeSet: " + getName();
    throw new MARSHAL( reason );
}
/**
 * Reads a wide string from the buffer. The length indicator is presumed already to have
 * been read. This base implementation does not support wide strings and always fails.
 *
 * @param buffer the buffer from which to read the string
 * @param lengthIndicator the length indicator already read
 * @param giop_minor the low-order byte of the giop version (1.x is assumed)
 * @param littleEndian true if the characters are to be read low end first
 * @return a string possibly containing wide characters.
 * @throws MARSHAL always, naming this code set.
 */
public String read_wstring( InputBuffer buffer, int lengthIndicator, int giop_minor, boolean littleEndian )
{
    final String reason = "Bad wide char codeSet: " + getName();
    throw new MARSHAL( reason );
}
/**
 * Writes a character to the buffer with the appropriate encoding. This base implementation
 * cannot encode anything and always fails.
 *
 * @param buffer the buffer to which the character is written
 * @param c the character to write
 * @param write_bom true if a byte-order-marker (indicating big-endian) should be written
 * @param write_length true if the length of the character should be written
 * @param giop_minor the low-order byte of the giop version (1.x is assumed)
 * @throws CODESET_INCOMPATIBLE always, naming this code set.
 */
public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length, int giop_minor )
{
    final String reason = "Bad codeset: " + getName();
    throw new CODESET_INCOMPATIBLE( reason );
}
/**
 * Writes a string to the buffer with the appropriate encoding, by writing its characters
 * one after another through {@link #write_char}.
 *
 * @param buffer the buffer to which the string is written
 * @param s the string to write
 * @param write_bom true if a byte-order-marker (indicating big-endian) should be written
 * @param write_length true if the length of each character should be written
 * @param giop_minor the low-order byte of the giop version (1.x is assumed)
 */
public void write_string(OutputBuffer buffer, String s, boolean write_bom, boolean write_length, int giop_minor)
{
    final int charCount = s.length();
    int index = 0;
    while (index < charCount)
    {
        write_char( buffer, s.charAt( index ), write_bom, write_length, giop_minor );
        index++;
    }
}
/**
 * Computes the size to record for the string just written to the buffer.
 * This base implementation always reports zero; code sets that support wide strings override it.
 *
 * @param string the string written
 * @param startPos the starting position at which the string was written
 * @param currentPos the current buffer position
 * @return always 0 here.
 */
public int get_wstring_size( String string, int startPos, int currentPos )
{
    return 0;
}
/**
 * Reads a wide string from the buffer according to GIOP 1.2. The length indicator is
 * presumed already to have been read and is taken as the number of bytes the string occupies.
 * An optional byte-order marker at the start decides the endianness passed to
 * {@link #read_wchar}.
 *
 * @param buffer the buffer from which to read the string
 * @param size the length indicator already read
 * @param giop_minor the low-order byte of the giop version (must be >= 2)
 * @return a string possibly containing wide characters.
 */
final String readGiop12WString( InputBuffer buffer, int size, int giop_minor )
{
    // Every character takes at least one byte, so 'size' chars is always enough room.
    final char[] decoded = new char[ size ];
    final int limit = buffer.get_pos() + size;
    final boolean littleEndian = buffer.readBOM();

    int count = 0;
    for (; buffer.get_pos() < limit; count++)
    {
        decoded[ count ] = read_wchar( buffer, giop_minor, littleEndian );
    }
    return new String( decoded, 0, count );
}
/**
 * Single-byte code set for ISO 8859-1; also the base class of the other single-byte code sets.
 */
private static class Iso8859_1CodeSet extends CodeSet
{
    /** Creates the ISO8859_1 code set with id 0x00010001. */
    private Iso8859_1CodeSet()
    {
        super( 0x00010001, "ISO8859_1" );
    }

    /**
     * Only used for derived codesets.
     *
     * @param i the code set id.
     * @param name the code set name.
     */
    Iso8859_1CodeSet(int i, String name)
    {
        super( i, name);
    }

    /**
     * Supports ordinary character data only.
     *
     * @return true if 'wide' is not specified.
     */
    @Override
    public boolean supportsCharacterData( boolean wide )
    {
        return !wide;
    }

    /**
     * Writes the character as one byte, keeping only its low-order eight bits.
     * The bom, length and giop arguments are not used.
     */
    @Override
    public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
    {
        final byte lowByte = (byte) c;
        buffer.write_byte( lowByte );
    }
}
/**
 * ASCII, handled as a local alias that shares the id 0x00010001 and the single-byte
 * logic of {@link Iso8859_1CodeSet}.
 */
private static class AsciiCodeSet extends Iso8859_1CodeSet
{
    /** Creates the ASCII alias. */
    private AsciiCodeSet()
    {
        super( 0x00010001, "ASCII" );
        isAlias = true;
    }

    /**
     * Only used for derived codesets; the result is flagged as an alias too.
     *
     * @param i the code set id.
     * @param name the code set name.
     */
    AsciiCodeSet(int i, String name)
    {
        super( i, name);
        isAlias = true;
    }
}
/**
 * MacRoman, handled as a local alias that shares the id 0x00010001 and the single-byte
 * logic of {@link Iso8859_1CodeSet}.
 */
private static class MacRomanCodeSet extends Iso8859_1CodeSet
{
    /** Creates the MacRoman alias. */
    private MacRomanCodeSet()
    {
        super( 0x00010001, "MacRoman" );
        isAlias = true;
    }

    /**
     * Only used for derived codesets; the result is flagged as an alias too.
     *
     * @param i the code set id.
     * @param name the code set name.
     */
    MacRomanCodeSet(int i, String name)
    {
        super( i, name);
        isAlias = true;
    }
}
/**
 * Single-byte code set for ISO 8859-15. It behaves like {@link Iso8859_1CodeSet} except for
 * eight characters that are written with their own byte values.
 */
private static class Iso8859_15CodeSet extends Iso8859_1CodeSet
{
    /** Creates the ISO8859_15 code set with id 0x0001000F. */
    private Iso8859_15CodeSet()
    {
        super( 0x0001000F, "ISO8859_15" );
    }

    /**
     * Writes the character as one byte. Eight characters are remapped to the byte values
     * listed below; every other character is written by the ISO8859_1 implementation.
     */
    @Override
    public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
    {
        final int remapped;
        switch (c)
        {
            case '\u20AC':
                remapped = 0xA4;
                break;
            case '\u0160':
                remapped = 0xA6;
                break;
            case '\u0161':
                remapped = 0xA8;
                break;
            case '\u017D':
                remapped = 0xB4;
                break;
            case '\u017E':
                remapped = 0xB8;
                break;
            case '\u0152':
                remapped = 0xBC;
                break;
            case '\u0153':
                remapped = 0xBD;
                break;
            case '\u0178':
                remapped = 0xBE;
                break;
            default:
                // Not one of the remapped characters: fall back to the ISO8859_1 encoding.
                super.write_char (buffer, c, write_bom, write_length_indicator, giop_minor);
                return;
        }
        buffer.write_byte( (byte) remapped );
    }
}
/**
 * UTF-8 code set, usable for both ordinary and wide character data.
 * Single characters are encoded and decoded as one-, two- or three-byte sequences.
 */
private static class Utf8CodeSet extends CodeSet
{
    /** Creates the UTF8 code set with id 0x05010001. */
    private Utf8CodeSet( )
    {
        super( 0x05010001, "UTF8" );
    }

    /**
     * Returns true for both wide and non-wide characters.
     */
    @Override
    public boolean supportsCharacterData( boolean wide )
    {
        return true;
    }

    /**
     * Decodes one UTF-8 encoded character from the buffer.
     *
     * @param buffer the buffer containing the data.
     * @param giop_minor the low-order byte of the giop version; must be at least 2.
     * @param littleEndian not used, UTF-8 has no byte order.
     * @return the decoded character.
     * @throws MARSHAL if giop_minor is less than 2.
     */
    @Override
    public char read_wchar( InputBuffer buffer, int giop_minor, boolean littleEndian )
    {
        if (giop_minor < 2)
        {
            throw new MARSHAL( "GIOP 1." + giop_minor +
                " only allows 2 Byte encodings for wchar, but the selected TCSW is UTF-8" );
        }

        short value = (short) (0xff & buffer.readByte());

        if ((value & 0x80) == 0)
        {
            // 0xxxxxxx: single byte
            return (char) value;
        }
        else if ((value & 0xe0) == 0xc0)
        {
            // 110xxxxx 10xxxxxx: two bytes
            return (char) (((value & 0x1F) << 6) | (buffer.readByte() & 0x3F));
        }
        else
        {
            // Any other lead byte is decoded as a three-byte sequence.
            short b2 = (short) (0xff & buffer.readByte());
            return (char) (((value & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (buffer.readByte() & 0x3F));
        }
    }

    /**
     * Reads a wide string in the GIOP 1.2 layout.
     *
     * @throws MARSHAL if giop_minor is less than 2.
     */
    @Override
    public String read_wstring( InputBuffer source, int lengthIndicator, int giop_minor, boolean little_endian )
    {
        if (giop_minor < 2) throw new MARSHAL( "Bad wide char codeSet: " + getName() );

        return readGiop12WString( source, lengthIndicator, giop_minor );
    }

    /**
     * Encodes one character as a one-, two- or three-byte UTF-8 sequence.
     * When a length indicator is requested, a single byte holding the sequence length is
     * written first. This applies to every GIOP minor version of 2 or above, matching the
     * readers of this class, which treat all such versions alike.
     *
     * @param buffer the buffer to which the character is written
     * @param c the character to write
     * @param write_bom not used, UTF-8 has no byte order.
     * @param write_length_indicator true if the length of the character should be written
     * @param giop_minor the low-order byte of the giop version (1.x is assumed)
     */
    @Override
    public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
    {
        final boolean prefixLength = write_length_indicator && giop_minor >= 2;

        if( c <= 0x007F )
        {
            if( prefixLength )
            {
                buffer.write_byte( (byte) 1 ); //the chars length in bytes
            }
            buffer.write_byte( (byte) c );
        }
        else if( c <= 0x07FF )
        {
            if( prefixLength )
            {
                buffer.write_byte( (byte) 2 ); //the chars length in bytes
            }
            buffer.write_byte( (byte)(0xC0 | ((c >> 6) & 0x1F)) );
            buffer.write_byte( (byte)(0x80 | (c & 0x3F)) );
        }
        else
        {
            if( prefixLength )
            {
                buffer.write_byte( (byte) 3 ); //the chars length in bytes
            }
            buffer.write_byte( (byte)(0xE0 | ((c >> 12) & 0x0F)) );
            buffer.write_byte( (byte)(0x80 | ((c >> 6) & 0x3F)) );
            buffer.write_byte( (byte)(0x80 | (c & 0x3F)) );
        }
    }

    /**
     * Writes the whole string in one go using the JVM's encoder for this code set's name.
     * The bom, length and giop arguments are not used.
     *
     * @throws CODESET_INCOMPATIBLE if the JVM does not know the encoding name.
     */
    @Override
    public void write_string( OutputBuffer buffer, String s, boolean write_bom, boolean write_length, int giop_minor )
    {
        try
        {
            byte[] bytes = s.getBytes(this.getName());
            buffer.write_octet_array(bytes, 0, bytes.length);
        }
        catch (UnsupportedEncodingException e)
        {
            // Keep the original failure attached for diagnosis.
            CODESET_INCOMPATIBLE failure = new CODESET_INCOMPATIBLE("Bad codeset: " + getName());
            failure.initCause( e );
            throw failure;
        }
    }

    /**
     * Computes the size from the buffer positions: the distance written since startPos, minus 4.
     */
    @Override
    public int get_wstring_size( String s, int startPos, int currentPos )
    {
        return currentPos - startPos - 4;
    }
}
/**
 * Common base of the code sets that encode every character in exactly two bytes.
 * Such code sets are only offered for wide character data.
 */
private abstract static class TwoByteCodeSet extends CodeSet
{
    /**
     * @param id the code set id.
     * @param name the code set name.
     */
    TwoByteCodeSet( int id, String name )
    {
        super( id, name );
    }

    /**
     * Returns true if the character type specified is 'wide'.
     */
    @Override
    public boolean supportsCharacterData( boolean wide )
    {
        return wide;
    }

    /**
     * Reads two bytes from the buffer and combines them into one character.
     *
     * @param buffer the buffer containing the data.
     * @param giop_minor not used.
     * @param littleEndian true if the first byte read is the low-order byte.
     * @return the character read.
     */
    @Override
    public char read_wchar( InputBuffer buffer, int giop_minor, boolean littleEndian )
    {
        // Read in stream order; only the way the two bytes are combined depends on endianness.
        final int first = buffer.readByte();
        final int second = buffer.readByte();
        if (littleEndian)
        {
            return (char) ((first & 0xFF) | (second << 8));
        }
        return (char) ((first << 8) | (second & 0xFF));
    }

    /**
     * Reads a wide string. For GIOP minor version 2 and above the GIOP 1.2 layout is used,
     * which is the same boundary {@link #write_char} uses when writing. For older versions
     * the length indicator counts 2-byte characters and a terminating NUL is dropped.
     *
     * @param source the buffer from which to read the string
     * @param lengthIndicator the length indicator already read
     * @param giop_minor the low-order byte of the giop version (1.x is assumed)
     * @param little_endian true if the characters are to be read low end first (pre-1.2 layout only)
     * @return the string read.
     */
    @Override
    public String read_wstring( InputBuffer source, int lengthIndicator, int giop_minor, boolean little_endian )
    {
        if( giop_minor >= 2 )
        {
            return readGiop12WString( source, lengthIndicator, giop_minor );
        }

        //GIOP 1.1 / 1.0 : length indicates number of 2-byte characters
        char buf[] = new char[lengthIndicator];
        int endPos = source.get_pos() + 2 * lengthIndicator;

        int i = 0;
        while( source.get_pos() < endPos )
        {
            buf[ i++ ] = read_wchar( source, giop_minor, little_endian );
        }

        if( (i != 0) && (buf[ i - 1 ] == 0) )
        {
            //don't return terminating NUL
            return new String( buf, 0, i - 1 );
        }

        //doesn't have a terminating NUL. This is actually not allowed
        return new String( buf, 0, i );
    }

    /**
     * Writes one character. Before GIOP 1.2 the character is written as an aligned short.
     * From GIOP 1.2 on it is written unaligned, big-endian, optionally preceded by a length
     * byte and a big-endian byte-order-marker.
     *
     * @param buffer the buffer to which the character is written
     * @param c the character to write
     * @param write_bom true if a byte-order-marker (indicating big-endian) should be written
     * @param write_length_indicator true if the length of the character should be written
     * @param giop_minor the low-order byte of the giop version (1.x is assumed)
     */
    @Override
    public void write_char( OutputBuffer buffer, char c, boolean write_bom, boolean write_length_indicator, int giop_minor )
    {
        if (giop_minor < 2)
        {
            buffer.write_short( (short) c ); //UTF-16 char is treated as an ushort (write aligned)
        }
        else
        {
            if (write_length_indicator)
            {
                buffer.write_byte( (byte) 2 ); //the chars length in bytes
            }

            if (write_bom)
            {
                //big endian encoding
                buffer.write_byte( (byte) 0xFE );
                buffer.write_byte( (byte) 0xFF );
            }

            //write unaligned
            buffer.write_byte( (byte)((c >> 8) & 0xFF) );
            buffer.write_byte( (byte) (c & 0xFF) );
        }
    }

    /**
     * Computes the size from the string itself; the buffer positions are not used.
     */
    @Override
    public int get_wstring_size( String s, int startPos, int currentPos )
    {
        return s.length() + 1; // size in chars (+ NUL char)
    }
}
/**
 * UTF-16 code set (id 0x00010109); two bytes per character, wide character data only.
 */
private static class Utf16CodeSet extends TwoByteCodeSet
{
    /** Creates the UTF16 code set. */
    private Utf16CodeSet()
    {
        super( 0x00010109, "UTF16" );
    }

    /**
     * Returns the configured value to use BOMs only when specifically configured to do so.
     *
     * @param configuredForBom true if the orb has been configured to write byte-order-markers.
     * @return the value of <code>configuredForBom</code>.
     */
    @Override
    public boolean write_bom( boolean configuredForBom )
    {
        // Returning a constant false here would make this override identical to the
        // base implementation and leave the BOM configuration without any effect.
        return configuredForBom;
    }
}
/**
 * UCS-2 code set (id 0x00010100); two bytes per character, wide character data only.
 * <p>
 * It keeps the inherited <code>write_bom</code>, which never asks for a byte-order-marker.
 * According to:
 *    http://www.omg.org/issues/issue4008.txt
 *    http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4506930
 * UCS-2 should not write a BOM.
 */
private static class Ucs2CodeSet extends TwoByteCodeSet
{
    /** Creates the UCS2 code set. */
    private Ucs2CodeSet()
    {
        super( 0x00010100, "UCS2" );
    }
}
}