/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.io; import junit.framework.TestCase; import java.io.IOException; import java.io.UTFDataFormatException; import java.util.Random; import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.StringUtils; /** Unit tests for UTF8. */ @SuppressWarnings("deprecation") public class TestUTF8 extends TestCase { public TestUTF8(String name) { super(name); } private static final Random RANDOM = new Random(); public static String getTestString() throws Exception { StringBuilder buffer = new StringBuilder(); int length = RANDOM.nextInt(100); for (int i = 0; i < length; i++) { buffer.append((char)(RANDOM.nextInt(Character.MAX_VALUE))); } return buffer.toString(); } public void testWritable() throws Exception { for (int i = 0; i < 10000; i++) { TestWritable.testWritable(new UTF8(getTestString())); } } public void testGetBytes() throws Exception { for (int i = 0; i < 10000; i++) { // generate a random string String before = getTestString(); // check its utf8 assertEquals(before, new String(UTF8.getBytes(before), "UTF-8")); } } public void testIO() throws Exception { DataOutputBuffer out = new DataOutputBuffer(); DataInputBuffer in = new DataInputBuffer(); for (int i = 0; i < 10000; i++) { // generate a random string String before = getTestString(); // write it out.reset(); UTF8.writeString(out, before); // test that it reads correctly in.reset(out.getData(), out.getLength()); String after = UTF8.readString(in); assertEquals(before, after); // test that it reads correctly with DataInput in.reset(out.getData(), out.getLength()); String after2 = in.readUTF(); assertEquals(before, after2); // test that it is compatible with Java's other decoder String after3 = new String(out.getData(), 2, out.getLength()-2, "UTF-8"); assertEquals(before, after3); } } public void testNullEncoding() throws Exception { String s = new String(new char[] { 0 }); DataOutputBuffer dob = new DataOutputBuffer(); new UTF8(s).write(dob); assertEquals(s, new String(dob.getData(), 2, dob.getLength()-2, "UTF-8")); } /** * Test encoding and decoding of UTF8 outside the basic multilingual plane. * * This is a regression test for HADOOP-9103. */ public void testNonBasicMultilingualPlane() throws Exception { // Test using the "CAT FACE" character (U+1F431) // See http://www.fileformat.info/info/unicode/char/1f431/index.htm String catFace = "\uD83D\uDC31"; // This encodes to 4 bytes in UTF-8: byte[] encoded = catFace.getBytes("UTF-8"); assertEquals(4, encoded.length); assertEquals("f09f90b1", StringUtils.byteToHexString(encoded)); // Decode back to String using our own decoder String roundTrip = UTF8.fromBytes(encoded); assertEquals(catFace, roundTrip); } /** * Test that decoding invalid UTF8 throws an appropriate error message. */ public void testInvalidUTF8() throws Exception { byte[] invalid = new byte[] { 0x01, 0x02, (byte)0xff, (byte)0xff, 0x01, 0x02, 0x03, 0x04, 0x05 }; try { UTF8.fromBytes(invalid); fail("did not throw an exception"); } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( "Invalid UTF8 at ffff01020304", utfde); } } /** * Test for a 5-byte UTF8 sequence, which is now considered illegal. */ public void test5ByteUtf8Sequence() throws Exception { byte[] invalid = new byte[] { 0x01, 0x02, (byte)0xf8, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80, 0x04, 0x05 }; try { UTF8.fromBytes(invalid); fail("did not throw an exception"); } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( "Invalid UTF8 at f88880808004", utfde); } } /** * Test that decoding invalid UTF8 due to truncation yields the correct * exception type. */ public void testInvalidUTF8Truncated() throws Exception { // Truncated CAT FACE character -- this is a 4-byte sequence, but we // only have the first three bytes. byte[] truncated = new byte[] { (byte)0xF0, (byte)0x9F, (byte)0x90 }; try { UTF8.fromBytes(truncated); fail("did not throw an exception"); } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( "Truncated UTF8 at f09f90", utfde); } } }