/**
 * Copyright (C) 2011 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package com.cloudhopper.commons.charset;

// third party imports
import com.cloudhopper.commons.util.HexUtil;
import java.util.Arrays;
import java.util.Map;
import org.junit.*;
import org.apache.log4j.Logger;

/**
 * Tests {@link CharsetUtil} encode, decode and normalize against known byte
 * sequences (written as hex strings) for each charset constant it exposes.
 *
 * @author joelauer
 */
public class CharsetUtilTest {
    // Not used by any assertion; handy for ad-hoc dumps while debugging, e.g.
    // logger.debug(HexUtil.toHexString(bytes)).
    private static final Logger logger = Logger.getLogger(CharsetUtilTest.class);

    /**
     * Asserts that encoding {@code text} with {@code charset} yields exactly
     * the bytes written as {@code expectedHex}.
     */
    private static void assertEncoded(String expectedHex, String text, Charset charset) {
        Assert.assertArrayEquals(HexUtil.toByteArray(expectedHex), CharsetUtil.encode(text, charset));
    }

    /**
     * Same as {@link #assertEncoded}, and additionally asserts that the result
     * matches what the JDK produces for the charset named {@code jdkCharsetName}.
     */
    private static void assertEncodedLikeJdk(String expectedHex, String text, Charset charset,
            String jdkCharsetName) throws Exception {
        byte[] actual = CharsetUtil.encode(text, charset);
        Assert.assertArrayEquals(HexUtil.toByteArray(expectedHex), actual);
        Assert.assertArrayEquals(text.getBytes(jdkCharsetName), actual);
    }

    /**
     * Asserts that decoding the bytes written as {@code hex} with
     * {@code charset} yields {@code expectedText}.
     */
    private static void assertDecoded(String expectedText, String hex, Charset charset) {
        Assert.assertEquals(expectedText, CharsetUtil.decode(HexUtil.toByteArray(hex), charset));
    }

    /**
     * Encodes a handful of sample strings with every charset under test and
     * compares the result with the expected bytes; for the JDK-backed charsets
     * the result is also compared with {@code String.getBytes}.
     */
    @Test
    public void encode() throws Exception {
        // euro currency symbol
        String text = "\u20ac";
        assertEncoded("1B65", text, CharsetUtil.CHARSET_GSM);
        assertEncoded("9B32", text, CharsetUtil.CHARSET_PACKED_GSM);
        assertEncodedLikeJdk("20AC", text, CharsetUtil.CHARSET_UCS_2, "UTF-16BE");
        assertEncodedLikeJdk("E282AC", text, CharsetUtil.CHARSET_UTF_8, "UTF-8");
        // latin-1 doesn't contain the euro symbol - it is replaced with '?'
        assertEncodedLikeJdk("3F", text, CharsetUtil.CHARSET_ISO_8859_1, "ISO-8859-1");
        // latin-9 does contain the euro symbol
        assertEncodedLikeJdk("A4", text, CharsetUtil.CHARSET_ISO_8859_15, "ISO-8859-15");
        assertEncoded("1B65", text, CharsetUtil.CHARSET_AIRWIDE_IA5);
        assertEncoded("1B65", text, CharsetUtil.CHARSET_VFD2_GSM);
        assertEncoded("1B65", text, CharsetUtil.CHARSET_VFTR_GSM);

        // longer string with @ symbol in-between
        text = "Hello @ World";
        assertEncoded("48656C6C6F200020576F726C64", text, CharsetUtil.CHARSET_GSM);
        assertEncoded("C8329BFD060140D7B79C4D06", text, CharsetUtil.CHARSET_PACKED_GSM);
        assertEncodedLikeJdk("00480065006C006C006F0020004000200057006F0072006C0064", text,
                CharsetUtil.CHARSET_UCS_2, "UTF-16BE");
        assertEncodedLikeJdk("48656C6C6F204020576F726C64", text, CharsetUtil.CHARSET_UTF_8, "UTF-8");
        assertEncodedLikeJdk("48656C6C6F204020576F726C64", text, CharsetUtil.CHARSET_ISO_8859_1, "ISO-8859-1");
        assertEncodedLikeJdk("48656C6C6F204020576F726C64", text, CharsetUtil.CHARSET_ISO_8859_15, "ISO-8859-15");
        // airwide is expected to encode '@' as 0x00, exactly like plain GSM
        assertEncoded("48656C6C6F200020576F726C64", text, CharsetUtil.CHARSET_AIRWIDE_IA5);
        // the vodafone variants are expected to keep '@' as 0x40
        assertEncoded("48656C6C6F204020576F726C64", text, CharsetUtil.CHARSET_VFD2_GSM);
        assertEncoded("48656C6C6F204020576F726C64", text, CharsetUtil.CHARSET_VFTR_GSM);

        // plain letters only
        text = "JoeyBlue";
        assertEncoded("4A6F6579426C7565", text, CharsetUtil.CHARSET_GSM);
        assertEncoded("CA77392F64D7CB", text, CharsetUtil.CHARSET_PACKED_GSM);
        assertEncodedLikeJdk("004A006F006500790042006C00750065", text, CharsetUtil.CHARSET_UCS_2, "UTF-16BE");
        assertEncodedLikeJdk("4A6F6579426C7565", text, CharsetUtil.CHARSET_UTF_8, "UTF-8");
        assertEncodedLikeJdk("4A6F6579426C7565", text, CharsetUtil.CHARSET_ISO_8859_1, "ISO-8859-1");
        assertEncodedLikeJdk("4A6F6579426C7565", text, CharsetUtil.CHARSET_ISO_8859_15, "ISO-8859-15");
        assertEncoded("4A6F6579426C7565", text, CharsetUtil.CHARSET_AIRWIDE_IA5);
        assertEncoded("4A6F6579426C7565", text, CharsetUtil.CHARSET_VFD2_GSM);
        assertEncoded("4A6F6579426C7565", text, CharsetUtil.CHARSET_VFTR_GSM);

        // braces and brackets (0x1B-escaped in the GSM family) followed by '$'
        text = "{}[]$";
        assertEncoded("1B281B291B3C1B3E02", text, CharsetUtil.CHARSET_GSM);
        assertEncoded("1BD426B5E16D7C02", text, CharsetUtil.CHARSET_PACKED_GSM);
        assertEncodedLikeJdk("007B007D005B005D0024", text, CharsetUtil.CHARSET_UCS_2, "UTF-16BE");
        assertEncodedLikeJdk("7B7D5B5D24", text, CharsetUtil.CHARSET_UTF_8, "UTF-8");
        assertEncodedLikeJdk("7B7D5B5D24", text, CharsetUtil.CHARSET_ISO_8859_1, "ISO-8859-1");
        assertEncodedLikeJdk("7B7D5B5D24", text, CharsetUtil.CHARSET_ISO_8859_15, "ISO-8859-15");
        // airwide is expected to encode '$' as 0x02, exactly like plain GSM
        assertEncoded("1B281B291B3C1B3E02", text, CharsetUtil.CHARSET_AIRWIDE_IA5);
        // the vodafone variants are expected to keep '$' as 0x24
        assertEncoded("1B281B291B3C1B3E24", text, CharsetUtil.CHARSET_VFD2_GSM);
        assertEncoded("1B281B291B3C1B3E24", text, CharsetUtil.CHARSET_VFTR_GSM);

        // chars specifically to vodafone-turkey
        //text = "$@£¤¥§ÄÅßñΓΔΘΩ€";
        text = "$@\u00a3\u00a4\u00a5\u00a7\u00c4\u00c5\u00df\u00f1\u0393\u0394\u0398\u03a9\u20ac";
        assertEncoded("2440A3A4A5A7C4C5DFF1137F19151B65", text, CharsetUtil.CHARSET_VFTR_GSM);
    }

    /**
     * Decodes known byte sequences with every charset under test and compares
     * the result with the expected string.
     */
    @Test
    public void decode() throws Exception {
        // euro currency symbol
        String expected = "\u20ac";
        assertDecoded(expected, "1B65", CharsetUtil.CHARSET_GSM);
        assertDecoded(expected, "9B32", CharsetUtil.CHARSET_PACKED_GSM);
        assertDecoded(expected, "20AC", CharsetUtil.CHARSET_UCS_2);
        assertDecoded(expected, "E282AC", CharsetUtil.CHARSET_UTF_8);
        assertDecoded(expected, "1B65", CharsetUtil.CHARSET_AIRWIDE_IA5);
        // latin-1 is deliberately skipped here: it doesn't contain the euro
        // symbol, so encoding replaces it with '?' (0x3F) and decoding 0x3F
        // cannot give the euro back
        assertDecoded(expected, "A4", CharsetUtil.CHARSET_ISO_8859_15);
        assertDecoded(expected, "1B65", CharsetUtil.CHARSET_VFD2_GSM);
        assertDecoded(expected, "1B65", CharsetUtil.CHARSET_VFTR_GSM);

        // longer string with @ symbol in-between
        expected = "Hello @ World";
        assertDecoded(expected, "48656C6C6F200020576F726C64", CharsetUtil.CHARSET_GSM);
        assertDecoded(expected, "C8329BFD060140D7B79C4D06", CharsetUtil.CHARSET_PACKED_GSM);
        assertDecoded(expected, "00480065006C006C006F0020004000200057006F0072006C0064", CharsetUtil.CHARSET_UCS_2);
        assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_UTF_8);
        assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_ISO_8859_1);
        assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_ISO_8859_15);
        assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_AIRWIDE_IA5);
        assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_VFD2_GSM);
        assertDecoded(expected, "48656C6C6F204020576F726C64", CharsetUtil.CHARSET_VFTR_GSM);

        // plain letters only
        expected = "JoeyBlue";
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_GSM);
        assertDecoded(expected, "CA77392F64D7CB", CharsetUtil.CHARSET_PACKED_GSM);
        assertDecoded(expected, "004A006F006500790042006C00750065", CharsetUtil.CHARSET_UCS_2);
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_UTF_8);
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_ISO_8859_1);
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_ISO_8859_15);
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_AIRWIDE_IA5);
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_VFD2_GSM);
        assertDecoded(expected, "4A6F6579426C7565", CharsetUtil.CHARSET_VFTR_GSM);

        // braces and brackets (0x1B-escaped in the GSM family) followed by '$'
        expected = "{}[]$";
        assertDecoded(expected, "1B281B291B3C1B3E02", CharsetUtil.CHARSET_GSM);
        assertDecoded(expected, "1BD426B5E16D7C02", CharsetUtil.CHARSET_PACKED_GSM);
        assertDecoded(expected, "007B007D005B005D0024", CharsetUtil.CHARSET_UCS_2);
        assertDecoded(expected, "7B7D5B5D24", CharsetUtil.CHARSET_UTF_8);
        // airwide is close to GSM, $ is 0x24 rather than 0x02 though
        assertDecoded(expected, "1B281B291B3C1B3E24", CharsetUtil.CHARSET_AIRWIDE_IA5);
        assertDecoded(expected, "7B7D5B5D24", CharsetUtil.CHARSET_ISO_8859_1);
        assertDecoded(expected, "7B7D5B5D24", CharsetUtil.CHARSET_ISO_8859_15);
        assertDecoded(expected, "1B281B291B3C1B3E24", CharsetUtil.CHARSET_VFD2_GSM);
        assertDecoded(expected, "1B281B291B3C1B3E24", CharsetUtil.CHARSET_VFTR_GSM);

        // had problem passing these tests on linux vs. mac os x -- issue with
        // byte encoding on different platforms, replaced tests with source strings
        // that use Java unicode escapes
        // helpful URL: http://www.greywyvern.com/code/php/utf8_html

        // decode a string with every char in VFD2-GSM
        // expected = "@$ߤ¡Ñܧñü_";
        expected = "@$\u00df\u00a4\u00a1\u00d1\u00dc\u00a7\u00f1\u00fc_";
        assertDecoded(expected, "40247E02A15F5D5E1E7D11", CharsetUtil.CHARSET_VFD2_GSM);

        //expected = "@$ß$@ÑÜ_ñü_";
        expected = "@$\u00df$@\u00d1\u00dc_\u00f1\u00fc_";
        assertDecoded(expected, "40241E24405D5E5F7D7E5F", CharsetUtil.CHARSET_AIRWIDE_IA5);

        // chars specifically to vodafone-turkey
        //expected = "$@£¤¥§ÄÅßñΓΔΘΩ€";
        expected = "$@\u00a3\u00a4\u00a5\u00a7\u00c4\u00c5\u00df\u00f1\u0393\u0394\u0398\u03a9\u20ac";
        assertDecoded(expected, "2440A3A4A5A7C4C5DFF1137F19151B65", CharsetUtil.CHARSET_VFTR_GSM);
    }

    /**
     * For every charset returned by {@code CharsetUtil.getCharsetMap()},
     * checks that decoding leaves the caller's byte array untouched.
     */
    @Test
    public void verifyDecodeDoesNotChangeByteArray() throws Exception {
        for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
            byte[] input = new byte[] { (byte)0x40, (byte)0x5F, (byte)0x24, (byte)0x78, (byte)0x02 };
            byte[] snapshot = Arrays.copyOf(input, input.length);
            // the decoded string is irrelevant here; only the effect on 'input' matters
            CharsetUtil.decode(input, entry.getValue());
            // test that the byte array wasn't changed
            Assert.assertArrayEquals("Charset " + entry.getKey() + " impl bad -- modified byte array parameter", snapshot, input);
        }
    }

    /**
     * For every charset returned by {@code CharsetUtil.getCharsetMap()},
     * checks the result of decoding a {@code null} byte array.
     */
    @Test
    public void verifyNullByteArray() throws Exception {
        for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
            // NOTE(review): the failure message says "did not return null" but the
            // assertion expects an empty string -- confirm which contract
            // CharsetUtil.decode(null, ...) is meant to honor and align the two
            Assert.assertEquals("Charset " + entry.getKey() + " impl bad -- did not return null", "", CharsetUtil.decode(null, entry.getValue()));
        }
    }

    /**
     * Checks that normalize returns text unchanged when the charset can
     * represent it, and "?" for a character the charset cannot represent.
     */
    @Test
    public void normalize() throws Exception {
        // try every charset with simple A-Z, a-z, and 0-9 which should work
        // in all charsets
        // NOTE(review): the lowercase run reads "abcdefjh..." ('j' where 'g' is
        // expected), so 'g' is not actually covered; literal left unchanged
        String in = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefjhijklmnopqrstuvwxyz01234567890?&@";
        for (Map.Entry<String,Charset> entry : CharsetUtil.getCharsetMap().entrySet()) {
            Assert.assertEquals("Charset " + entry.getKey() + " implementation broken", in, CharsetUtil.normalize(in, entry.getValue()));
        }

        // euro currency char (supported everywhere below except latin-1)
        in = "\u20AC";
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_GSM));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_PACKED_GSM));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_AIRWIDE_GSM));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFD2_GSM));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFTR_GSM));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_1));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_15));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2));
        Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UTF_8));

        // CJK ideograph U+6025 (only the unicode charsets below can represent it)
        in = "\u6025";
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_GSM));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_PACKED_GSM));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_AIRWIDE_GSM));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFD2_GSM));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFTR_GSM));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_1));
        Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_15));
        Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2));
        Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UTF_8));
    }
}