/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.io; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOException; import java.io.UTFDataFormatException; import java.nio.ByteBuffer; import java.util.Random; import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.StringUtils; import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; /** Unit tests for UTF8. */ @SuppressWarnings("deprecation") public class TestUTF8 { private static final Random RANDOM = new Random(); public static String getTestString() throws Exception { StringBuilder buffer = new StringBuilder(); int length = RANDOM.nextInt(100); for (int i = 0; i < length; i++) { buffer.append((char)(RANDOM.nextInt(Character.MAX_VALUE))); } return buffer.toString(); } @Test public void testWritable() throws Exception { for (int i = 0; i < 10000; i++) { TestWritable.testWritable(new UTF8(getTestString())); } } @Test public void testGetBytes() throws Exception { for (int i = 0; i < 10000; i++) { // generate a random string String before = getTestString(); // Check that the bytes are stored correctly in Modified-UTF8 format. // Note that the DataInput and DataOutput interfaces convert between // bytes and Strings using the Modified-UTF8 format. assertEquals(before, readModifiedUTF(UTF8.getBytes(before))); } } private String readModifiedUTF(byte[] bytes) throws IOException { final short lengthBytes = (short)2; ByteBuffer bb = ByteBuffer.allocate(bytes.length + lengthBytes); bb.putShort((short)bytes.length).put(bytes); ByteArrayInputStream bis = new ByteArrayInputStream(bb.array()); DataInputStream dis = new DataInputStream(bis); return dis.readUTF(); } @Test public void testIO() throws Exception { DataOutputBuffer out = new DataOutputBuffer(); DataInputBuffer in = new DataInputBuffer(); for (int i = 0; i < 10000; i++) { // generate a random string String before = getTestString(); // write it out.reset(); UTF8.writeString(out, before); // test that it reads correctly in.reset(out.getData(), out.getLength()); String after = UTF8.readString(in); assertEquals(before, after); // test that it reads correctly with DataInput in.reset(out.getData(), out.getLength()); String after2 = in.readUTF(); assertEquals(before, after2); } } @Test public void testNullEncoding() throws Exception { String s = new String(new char[] { 0 }); DataOutputBuffer dob = new DataOutputBuffer(); new UTF8(s).write(dob); assertEquals(s, new String(dob.getData(), 2, dob.getLength()-2, "UTF-8")); } /** * Test encoding and decoding of UTF8 outside the basic multilingual plane. * * This is a regression test for HADOOP-9103. */ @Test public void testNonBasicMultilingualPlane() throws Exception { // Test using the "CAT FACE" character (U+1F431) // See http://www.fileformat.info/info/unicode/char/1f431/index.htm String catFace = "\uD83D\uDC31"; // This encodes to 4 bytes in UTF-8: byte[] encoded = catFace.getBytes("UTF-8"); assertEquals(4, encoded.length); assertEquals("f09f90b1", StringUtils.byteToHexString(encoded)); // Decode back to String using our own decoder String roundTrip = UTF8.fromBytes(encoded); assertEquals(catFace, roundTrip); } /** * Test that decoding invalid UTF8 throws an appropriate error message. */ @Test public void testInvalidUTF8() throws Exception { byte[] invalid = new byte[] { 0x01, 0x02, (byte)0xff, (byte)0xff, 0x01, 0x02, 0x03, 0x04, 0x05 }; try { UTF8.fromBytes(invalid); fail("did not throw an exception"); } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( "Invalid UTF8 at ffff01020304", utfde); } } /** * Test for a 5-byte UTF8 sequence, which is now considered illegal. */ @Test public void test5ByteUtf8Sequence() throws Exception { byte[] invalid = new byte[] { 0x01, 0x02, (byte)0xf8, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80, 0x04, 0x05 }; try { UTF8.fromBytes(invalid); fail("did not throw an exception"); } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( "Invalid UTF8 at f88880808004", utfde); } } /** * Test that decoding invalid UTF8 due to truncation yields the correct * exception type. */ @Test public void testInvalidUTF8Truncated() throws Exception { // Truncated CAT FACE character -- this is a 4-byte sequence, but we // only have the first three bytes. byte[] truncated = new byte[] { (byte)0xF0, (byte)0x9F, (byte)0x90 }; try { UTF8.fromBytes(truncated); fail("did not throw an exception"); } catch (UTFDataFormatException utfde) { GenericTestUtils.assertExceptionContains( "Truncated UTF8 at f09f90", utfde); } } }