/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.util;
import java.nio.charset.Charset;
import java.util.Arrays;
import org.junit.Assert;
import org.junit.Test;
/**
 * Exploratory tests showing how the JVM's charsets behave when bytes are
 * decoded into strings and strings are encoded back into bytes.
 */
public class TestCharset {

  /**
   * Prints the platform default charset. Nothing is asserted because the
   * value depends on the machine running the test.
   */
  @Test
  public void testDefaultCharset() {
    Charset platformDefault = Charset.defaultCharset();
    System.out.println("Default: " + platformDefault);
    System.out.println(platformDefault.displayName());
    // Values observed so far: "UTF-8" on Ubuntu and, surprisingly,
    // "US-ASCII" on Hudson.
  }

  /**
   * Prints the name of every charset the running JVM makes available.
   */
  @Test
  public void testDumpCharsets() {
    System.out.println("== charsets");
    for (String charsetName : Charset.availableCharsets().keySet()) {
      System.out.println(charsetName);
    }
  }

  /**
   * Builds an array holding every possible byte value once.
   *
   * @return a 256-element array in which element {@code i} is {@code (byte) i}
   */
  public byte[] allbytes() {
    byte[] table = new byte[256];
    int value = 0;
    while (value < table.length) {
      // The narrowing cast wraps 128..255 around to -128..-1.
      table[value] = (byte) value;
      value++;
    }
    return table;
  }

  /**
   * Renders bytes as lower-case, two-digit hex values, each followed by a
   * single space.
   *
   * @param b the bytes to render
   * @return the hex dump; empty when {@code b} has no elements
   */
  public String dumpHex(byte[] b) {
    StringBuilder hex = new StringBuilder();
    for (byte octet : b) {
      hex.append(String.format("%02x ", octet));
    }
    return hex.toString();
  }

  /**
   * Decodes all 256 byte values with the platform default charset, UTF-8,
   * US-ASCII and ISO-8859-1, then prints the decoded string lengths, the
   * re-encoded byte lengths and hex dumps of the re-encoded bytes. Only
   * ISO-8859-1 is asserted to reproduce the original bytes exactly.
   */
  @Test
  public void testDefaultStringCharset() {
    Charset utf8Charset = Charset.forName("UTF-8");
    Charset asciiCharset = Charset.forName("US-ASCII");
    Charset latin1Charset = Charset.forName("ISO-8859-1");

    byte[] all = allbytes();

    // Decode the same bytes four different ways.
    String auto = new String(all);
    String utf8 = new String(all, utf8Charset);
    String ascii = new String(all, asciiCharset);
    String latin1 = new String(all, latin1Charset);

    // Encode each string back with the charset that produced it.
    byte[] autoBytes = auto.getBytes();
    byte[] utf8Bytes = utf8.getBytes(utf8Charset);
    byte[] asciiBytes = ascii.getBytes(asciiCharset);
    byte[] latin1Bytes = latin1.getBytes(latin1Charset);

    System.out.printf("lengths: auto: %d utf8: %d ascii: %d latin1: %d\n",
        auto.length(), utf8.length(), ascii.length(), latin1.length());
    System.out.printf("bytelen: auto: %d utf8: %d ascii: %d latin1: %d\n",
        autoBytes.length, utf8Bytes.length, asciiBytes.length,
        latin1Bytes.length);

    System.out.printf("original : %s\n", dumpHex(all));
    System.out.printf("auto : %s\n", dumpHex(autoBytes));
    System.out.printf("utf-8 : %s\n", dumpHex(utf8Bytes));
    System.out.printf("ascii : %s\n", dumpHex(asciiBytes));
    System.out.printf("latin1 : %s\n", dumpHex(latin1Bytes));

    Assert.assertTrue(Arrays.equals(all, latin1Bytes));
  }
}