package com.cloudhopper.commons.charset;
/*
* #%L
* ch-commons-charset
* %%
* Copyright (C) 2012 Cloudhopper by Twitter
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
// third party imports
import com.cloudhopper.commons.util.HexUtil;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.Map;
/**
*
* @author joelauer (twitter: @jjlauer or <a href="http://twitter.com/jjlauer" target=window>http://twitter.com/jjlauer</a>)
*/
public class CharsetUtilTest {
private static final Logger logger = LoggerFactory.getLogger(CharsetUtilTest.class);
/**
 * Encodes a handful of sample strings with each supported charset and checks
 * the resulting bytes against known-good hex dumps. For charsets that have a
 * JDK counterpart (UTF-16BE/LE, UTF-8, ISO-8859-1, ISO-8859-15) the bytes are
 * additionally cross-checked against {@code String.getBytes}.
 */
@Test
public void encode() throws Exception {
    // euro currency symbol
    String text = "\u20ac";
    assertEncoded(text, CharsetUtil.CHARSET_GSM, "1B65");
    assertEncoded(text, CharsetUtil.CHARSET_PACKED_GSM, "9B32");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2, "20AC", "UTF-16BE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2LE, "AC20", "UTF-16LE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UTF_8, "E282AC", "UTF-8");
    // latin-1 doesn't contain the euro symbol - replaced with '?'
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_1, "3F", "ISO-8859-1");
    // latin-9 does contain the euro symbol
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_15, "A4", "ISO-8859-15");
    assertEncoded(text, CharsetUtil.CHARSET_AIRWIDE_IA5, "1B65");
    assertEncoded(text, CharsetUtil.CHARSET_VFD2_GSM, "1B65");
    assertEncoded(text, CharsetUtil.CHARSET_VFTR_GSM, "1B65");
    assertEncoded(text, CharsetUtil.CHARSET_TMOBILENL_GSM, "80");

    // longer string with @ symbol in-between
    text = "Hello @ World";
    assertEncoded(text, CharsetUtil.CHARSET_GSM, "48656C6C6F200020576F726C64");
    assertEncoded(text, CharsetUtil.CHARSET_PACKED_GSM, "C8329BFD060140D7B79C4D06");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2, "00480065006C006C006F0020004000200057006F0072006C0064", "UTF-16BE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2LE, "480065006C006C006F0020004000200057006F0072006C006400", "UTF-16LE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UTF_8, "48656C6C6F204020576F726C64", "UTF-8");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_1, "48656C6C6F204020576F726C64", "ISO-8859-1");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_15, "48656C6C6F204020576F726C64", "ISO-8859-15");
    assertEncoded(text, CharsetUtil.CHARSET_AIRWIDE_IA5, "48656C6C6F200020576F726C64");
    assertEncoded(text, CharsetUtil.CHARSET_VFD2_GSM, "48656C6C6F204020576F726C64");
    assertEncoded(text, CharsetUtil.CHARSET_VFTR_GSM, "48656C6C6F204020576F726C64");
    assertEncoded(text, CharsetUtil.CHARSET_TMOBILENL_GSM, "48656C6C6F200020576F726C64");

    // plain ASCII letters only
    text = "JoeyBlue";
    assertEncoded(text, CharsetUtil.CHARSET_GSM, "4A6F6579426C7565");
    assertEncoded(text, CharsetUtil.CHARSET_PACKED_GSM, "CA77392F64D7CB");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2, "004A006F006500790042006C00750065", "UTF-16BE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2LE, "4A006F006500790042006C0075006500", "UTF-16LE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UTF_8, "4A6F6579426C7565", "UTF-8");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_1, "4A6F6579426C7565", "ISO-8859-1");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_15, "4A6F6579426C7565", "ISO-8859-15");
    assertEncoded(text, CharsetUtil.CHARSET_AIRWIDE_IA5, "4A6F6579426C7565");
    assertEncoded(text, CharsetUtil.CHARSET_VFD2_GSM, "4A6F6579426C7565");
    assertEncoded(text, CharsetUtil.CHARSET_VFTR_GSM, "4A6F6579426C7565");
    assertEncoded(text, CharsetUtil.CHARSET_TMOBILENL_GSM, "4A6F6579426C7565");

    // braces, brackets and the dollar sign
    text = "{}[]$";
    assertEncoded(text, CharsetUtil.CHARSET_GSM, "1B281B291B3C1B3E02");
    assertEncoded(text, CharsetUtil.CHARSET_PACKED_GSM, "1BD426B5E16D7C02");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2, "007B007D005B005D0024", "UTF-16BE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UCS_2LE, "7B007D005B005D002400", "UTF-16LE");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_UTF_8, "7B7D5B5D24", "UTF-8");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_1, "7B7D5B5D24", "ISO-8859-1");
    assertEncodedLikeJdk(text, CharsetUtil.CHARSET_ISO_8859_15, "7B7D5B5D24", "ISO-8859-15");
    assertEncoded(text, CharsetUtil.CHARSET_AIRWIDE_IA5, "1B281B291B3C1B3E02");
    assertEncoded(text, CharsetUtil.CHARSET_VFD2_GSM, "1B281B291B3C1B3E24");
    assertEncoded(text, CharsetUtil.CHARSET_VFTR_GSM, "1B281B291B3C1B3E24");
    // {}[] not supported - each one is expected to come out as '?' (0x3F)
    assertEncoded(text, CharsetUtil.CHARSET_TMOBILENL_GSM, "3F3F3F3F02");

    // chars specifically to vodafone-turkey
    // literal form: "$@£¤¥§ÄÅßñΓΔΘΩ€" (written with unicode escapes below)
    text = "$@\u00a3\u00a4\u00a5\u00a7\u00c4\u00c5\u00df\u00f1\u0393\u0394\u0398\u03a9\u20ac";
    assertEncoded(text, CharsetUtil.CHARSET_VFTR_GSM, "2440A3A4A5A7C4C5DFF1137F19151B65");

    // form feed is an escape code in GSM (two bytes per char expected)
    text = "\f\f";
    assertEncoded(text, CharsetUtil.CHARSET_GSM, "1B0A1B0A");
}

/**
 * Encodes {@code text} with {@code charset} and asserts the bytes equal the
 * given hex dump.
 *
 * @param text the string to encode
 * @param charset the charset under test
 * @param expectedHex the expected encoding as a hex string
 * @return the encoded bytes, so callers can run further checks on them
 */
private static byte[] assertEncoded(String text, Charset charset, String expectedHex) throws Exception {
    byte[] actual = CharsetUtil.encode(text, charset);
    Assert.assertArrayEquals(HexUtil.toByteArray(expectedHex), actual);
    return actual;
}

/**
 * Same as {@code assertEncoded}, then cross-checks the bytes against the
 * JDK's own encoding of {@code text} under {@code jdkCharsetName}.
 *
 * @param text the string to encode
 * @param charset the charset under test
 * @param expectedHex the expected encoding as a hex string
 * @param jdkCharsetName name of the JDK charset passed to {@code String.getBytes}
 */
private static void assertEncodedLikeJdk(String text, Charset charset, String expectedHex, String jdkCharsetName) throws Exception {
    byte[] actual = assertEncoded(text, charset, expectedHex);
    Assert.assertArrayEquals(text.getBytes(jdkCharsetName), actual);
}
/**
 * Decodes known hex dumps with each supported charset and checks that the
 * resulting string matches the expected text.
 */
@Test
public void decode() throws Exception {
    // euro currency symbol
    String expected = "\u20ac";
    assertDecoded(expected, "1B65", CharsetUtil.CHARSET_GSM);
    assertDecoded(expected, "9B32", CharsetUtil.CHARSET_PACKED_GSM);
    assertDecoded(expected, "20AC", CharsetUtil.CHARSET_UCS_2);
    assertDecoded(expected, "E282AC", CharsetUtil.CHARSET_UTF_8);
    assertDecoded(expected, "1B65", CharsetUtil.CHARSET_AIRWIDE_IA5);
    // ISO-8859-1 is deliberately skipped: latin-1 doesn't contain the euro
    // symbol, so it is encoded as '?' (0x3F) and cannot be decoded back
    assertDecoded(expected, "A4", CharsetUtil.CHARSET_ISO_8859_15);
    assertDecoded(expected, "1B65", CharsetUtil.CHARSET_VFD2_GSM);
    assertDecoded(expected, "1B65", CharsetUtil.CHARSET_VFTR_GSM);
    assertDecoded(expected, "80", CharsetUtil.CHARSET_TMOBILENL_GSM);

    // longer string with @ symbol in-between
    expected = "Hello @ World";
    assertDecoded(expected, "48656C6C6F200020576F726C64", CharsetUtil.CHARSET_GSM);
    assertDecoded(expected, "C8329BFD060140D7B79C4D06", CharsetUtil.CHARSET_PACKED_GSM);
    assertDecoded(expected, "00480065006C006C006F0020004000200057006F0072006C0064", CharsetUtil.CHARSET_UCS_2);
    assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_UTF_8);
    assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_ISO_8859_1);
    assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_ISO_8859_15);
    assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_AIRWIDE_IA5);
    assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_VFD2_GSM);
    assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_VFTR_GSM);
    assertDecoded(expected, "48656C6C6F200020576F726C64", CharsetUtil.CHARSET_TMOBILENL_GSM);

    // plain ASCII letters only
    expected = "JoeyBlue";
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_GSM);
    assertDecoded(expected, "CA77392F64D7CB", CharsetUtil.CHARSET_PACKED_GSM);
    assertDecoded(expected, "004A006F006500790042006C00750065", CharsetUtil.CHARSET_UCS_2);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_UTF_8);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_ISO_8859_1);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_ISO_8859_15);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_AIRWIDE_IA5);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_VFD2_GSM);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_VFTR_GSM);
    assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_TMOBILENL_GSM);

    // braces, brackets and the dollar sign
    expected = "{}[]$";
    assertDecoded(expected, "1B281B291B3C1B3E02", CharsetUtil.CHARSET_GSM);
    assertDecoded(expected, "1BD426B5E16D7C02", CharsetUtil.CHARSET_PACKED_GSM);
    assertDecoded(expected, "007B007D005B005D0024", CharsetUtil.CHARSET_UCS_2);
    assertDecoded(expected, "7B7D5B5D24", CharsetUtil.CHARSET_UTF_8);
    // airwide is close to GSM, $ is 0x24 rather than 0x02 though
    assertDecoded(expected, "1B281B291B3C1B3E24", CharsetUtil.CHARSET_AIRWIDE_IA5);
    assertDecoded(expected, "7B7D5B5D24", CharsetUtil.CHARSET_ISO_8859_1);
    assertDecoded(expected, "7B7D5B5D24", CharsetUtil.CHARSET_ISO_8859_15);
    assertDecoded(expected, "1B281B291B3C1B3E24", CharsetUtil.CHARSET_VFD2_GSM);
    assertDecoded(expected, "1B281B291B3C1B3E24", CharsetUtil.CHARSET_VFTR_GSM);
    // skip TMOBILENL_GSM - can't encode {}[]

    // had problem passing these tests on linux vs. mac os x -- issue with
    // byte encoding on different platforms, replaced tests with source strings
    // that use Java unicode escapes
    // helpful URL: http://www.greywyvern.com/code/php/utf8_html

    // decode a string of VFD2-GSM chars
    // literal form: "@$ߤ¡Ñܧñü_"
    expected = "@$\u00df\u00a4\u00a1\u00d1\u00dc\u00a7\u00f1\u00fc_";
    assertDecoded(expected, "40247E02A15F5D5E1E7D11", CharsetUtil.CHARSET_VFD2_GSM);

    // literal form: "@$ß$@ÑÜ_ñü_"
    expected = "@$\u00df$@\u00d1\u00dc_\u00f1\u00fc_";
    assertDecoded(expected, "40241E24405D5E5F7D7E5F", CharsetUtil.CHARSET_AIRWIDE_IA5);

    // chars specifically to vodafone-turkey
    // literal form: "$@£¤¥§ÄÅßñΓΔΘΩ€"
    expected = "$@\u00a3\u00a4\u00a5\u00a7\u00c4\u00c5\u00df\u00f1\u0393\u0394\u0398\u03a9\u20ac";
    assertDecoded(expected, "2440A3A4A5A7C4C5DFF1137F19151B65", CharsetUtil.CHARSET_VFTR_GSM);

    // form feed GSM escape sequence
    expected = "\f\f";
    assertDecoded(expected, "1B0A1B0A", CharsetUtil.CHARSET_GSM);
}

/**
 * Decodes the bytes described by {@code hex} with {@code charset} and asserts
 * the result equals {@code expectedText}.
 *
 * @param expectedText the string the bytes should decode to
 * @param hex the input bytes as a hex string
 * @param charset the charset under test
 */
private static void assertDecoded(String expectedText, String hex, Charset charset) throws Exception {
    String actual = CharsetUtil.decode(HexUtil.toByteArray(hex), charset);
    Assert.assertEquals(expectedText, actual);
}
/**
 * Verifies, for every charset returned by {@code CharsetUtil.getCharsetMap()},
 * that decoding leaves the caller's byte array untouched.
 */
@Test
public void verifyDecodeDoesNotChangeByteArray() throws Exception {
    for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
        // build a fresh input for each charset so a misbehaving implementation
        // cannot affect the check of the next one
        byte[] bytes = new byte[] { (byte)0x40, (byte)0x5F, (byte)0x24, (byte)0x78, (byte)0x02, (byte)0x02};
        byte[] expectedBytes = Arrays.copyOf(bytes, bytes.length);
        // the decoded text is irrelevant here; only the side effect on the input matters
        CharsetUtil.decode(bytes, entry.getValue());
        // test that the byte array wasn't changed
        Assert.assertArrayEquals("Charset " + entry.getKey() + " impl bad -- modified byte array parameter", expectedBytes, bytes);
    }
}
/**
 * Verifies, for every charset returned by {@code CharsetUtil.getCharsetMap()},
 * that decoding a {@code null} byte array yields {@code null}.
 */
@Test
public void verifyNullByteArray() throws Exception {
    for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
        // a null input must come back as null
        Assert.assertNull("Charset " + entry.getKey() + " impl bad -- did not return null", CharsetUtil.decode(null, entry.getValue()));
    }
}
/**
 * Round-trips a plain alphanumeric string through every charset returned by
 * {@code CharsetUtil.getCharsetMap()}, decoding into a pre-filled
 * {@code StringBuilder} to prove the decoded text is appended to the
 * builder's existing content.
 */
@Test
public void decodeToStringBuilderAllCharsets() throws Exception {
    // try every charset with simple A-Z, a-z, and 0-9 which should work in all charsets
    String expectedString = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    // test decode to stringBuilder
    for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
        // encode via the charset-name overload, decode via the Charset instance
        byte[] encoded = CharsetUtil.encode(expectedString, entry.getKey());
        // make this a harder test where we actually test this was appended!
        StringBuilder sb = new StringBuilder("T");
        CharsetUtil.decode(encoded, sb, entry.getValue());
        Assert.assertEquals("Charset " + entry.getKey() + " impl broken", "T"+expectedString, sb.toString());
    }
}
/**
 * Round-trips a plain alphanumeric string through every charset returned by
 * {@code CharsetUtil.getCharsetMap()} using the String-returning decode.
 */
@Test
public void decodeToStringAllCharsets() throws Exception {
    // try every charset with simple A-Z, a-z, and 0-9 which should work in all charsets
    String expectedString = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    // test decode to a new String
    for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
        // encode via the Charset instance, decode via the charset-name overload
        byte[] encoded = CharsetUtil.encode(expectedString, entry.getValue());
        String actualString = CharsetUtil.decode(encoded, entry.getKey());
        Assert.assertEquals("Charset " + entry.getKey() + " impl broken", expectedString, actualString);
    }
}
/**
 * Checks {@code CharsetUtil.normalize}: text a charset can represent must come
 * back unchanged, while an unsupported char is expected to come back as '?'.
 */
@Test
public void normalize() throws Exception {
    // try every charset with simple A-Z, a-z, 0-9 and a few punctuation chars
    // which should work in all charsets
    String in = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789?&@";
    for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
        Assert.assertEquals("Charset " + entry.getKey() + " implementation broken", in, CharsetUtil.normalize(in, entry.getValue()));
    }

    // euro currency char: kept by every charset below except ISO-8859-1
    in = "\u20AC";
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_GSM));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_PACKED_GSM));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_AIRWIDE_GSM));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFD2_GSM));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFTR_GSM));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_1));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_15));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2LE));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UTF_8));
    Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_TMOBILENL_GSM));

    // CJK ideograph (U+6025): only the unicode charsets (UCS-2, UCS-2LE, UTF-8) keep it
    in = "\u6025";
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_GSM));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_PACKED_GSM));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_AIRWIDE_GSM));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFD2_GSM));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFTR_GSM));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_1));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_15));
    Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2));
    Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2LE));
    Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UTF_8));
    Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_TMOBILENL_GSM));
}
}