/*********************************************************************************
* TotalCross Software Development Kit *
* Copyright (C) 2003-2004 Jaxo Systems - Pierre G. Richard *
* Copyright (C) 2003-2012 SuperWaba Ltda. *
* All Rights Reserved *
* *
* This library and virtual machine is distributed in the hope that it will *
* be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* *
* This file is covered by the GNU LESSER GENERAL PUBLIC LICENSE VERSION 3.0 *
* A copy of this license is located in file license.txt at the root of this *
* SDK or can be downloaded here: *
* http://www.gnu.org/licenses/lgpl-3.0.txt *
* *
*********************************************************************************/
package totalcross.sys;
/**
* This class is used to correctly handle UTF8 byte to UCS-2 chracter conversions.
* <P>
* To use this class, you can call
* <pre>totalcross.sys.Convert.setDefaultConverter("UTF8");</pre>
*
* @see totalcross.sys.Convert#charConverter
* @see totalcross.sys.Convert#setDefaultConverter(String)
* @see totalcross.sys.CharacterConverter
* @see CharacterConverter
* @author Pierre G. Richard
*/
public class UTF8CharacterConverter extends CharacterConverter
{
/**
* Convert UTF-8 bytes to UCS-2 characters
*
* @param bytes byte array to convert
* @param start first byte to convert in the byte array
* @param length number of bytes to convert
* @return UCS-2 character array resulting from the conversion
*/
public char[] bytes2chars(byte bytes[], int start, int length)
{
int end = start + length;
int tgtOfs = 0;
char []chars = new char[length]; // upper bound
while (start < end)
{
int c0 = bytes[start++] & 0xFF;
if (c0 < 0x80) // if a 1 byte sequence,
{
chars[tgtOfs++] = (char)c0; // set the value
continue; // success.
}
if (start >= end) // If no byte follows,
{
chars[tgtOfs++] = '?'; // set MCS
break; // done
}
int c = (bytes[start++] & 0xFF) ^ 0x80; // 2nd byte
if ((c & 0xC0) != 0) // starts new sequence?
{
--start; // Yes, backup
chars[tgtOfs++] = '?'; // set MCS
continue; // pursue
}
int r = (c0 << 6) | c; // Get encoded value
if ((c0 & 0xE0) == 0xC0) // 2 bytes sequence?
{
chars[tgtOfs++] = (char)(r & 0x7FF); // Yes. Cut noise
continue; // pursue
}
if (start >= end) // If no byte follows,
{
chars[tgtOfs++] = '?'; // set MCS
break; // done
}
c = (bytes[start++] & 0xFF) ^ 0x80; // 3rd byte
if ((c & 0xC0) != 0) // starts new sequence?
{
--start; // Yes, backup
chars[tgtOfs++] = '?'; // set MCS
continue; // pursue
}
chars[tgtOfs++] = (char)((r << 6) | c); // Get encoded value
}
if (chars.length > tgtOfs) // too much room left
{
char[] temp = new char[tgtOfs]; // shrink to exact size
Vm.arrayCopy(chars, 0, temp, 0, tgtOfs);
chars = temp;
}
return chars;
}
/**
* Convert UCS-2 characters to UTF-8 bytes
*
* @param chars character array to convert
* @param start first character to convert in the character array
* @param length number of characters to convert
* @return UTF-8 byte array resulting from the conversion
*/
public byte[] chars2bytes(char chars[], int start, int length)
{
int tgtOfs = 0;
int end = start + length;
byte[] bytes = new byte[length+length+length]; // guich@566_5: worst case is all chars > 0x800, which leads to 3 x length
while (start < end)
{
int r = chars[start++];
if (r < 0x80) // 1 byte sequence
bytes[tgtOfs++] = (byte)r; // Yes: set the value
else
if (r < 0x800) // 2 bytes sequence?
{
bytes[tgtOfs++] = (byte)(0xC0 | (r >> 6));
bytes[tgtOfs++] = (byte)(0x80 | (r & 0x3F));
}
else // 3 bytes sequence.
{
bytes[tgtOfs++] = (byte)(0xE0 | (r >> 12));
bytes[tgtOfs++] = (byte)(0x80 | ((r >> 6) & 0x3F));
bytes[tgtOfs++] = (byte)(0x80 | (r & 0x3F));
}
}
if (bytes.length > tgtOfs) // too much room left
{
byte[] temp = new byte[tgtOfs]; // shrink to exact size
Vm.arrayCopy(bytes, 0, temp, 0, tgtOfs);
bytes = temp;
}
return bytes;
}
native public char[]bytes2chars4D(byte bytes[], int offset, int length);
native public byte[] chars2bytes4D(char chars[], int offset, int length);
}