/* * Copyright (C) 2013 The Guava Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.common.base; import static com.google.common.truth.Truth.assertThat; import static java.lang.Character.MAX_CODE_POINT; import static java.lang.Character.MAX_HIGH_SURROGATE; import static java.lang.Character.MAX_LOW_SURROGATE; import static java.lang.Character.MIN_HIGH_SURROGATE; import static java.lang.Character.MIN_LOW_SURROGATE; import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; import com.google.common.annotations.GwtCompatible; import com.google.common.annotations.GwtIncompatible; import com.google.common.collect.ImmutableList; import java.util.Arrays; import java.util.HashMap; import java.util.Random; import junit.framework.TestCase; /** * Unit tests for {@link Utf8}. * * @author Jon Perlow * @author Martin Buchholz * @author Clément Roux */ @GwtCompatible(emulated = true) public class Utf8Test extends TestCase { private static final ImmutableList<String> ILL_FORMED_STRINGS; static { ImmutableList.Builder<String> builder = ImmutableList.builder(); char[] surrogates = { MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE, }; for (char surrogate : surrogates) { builder.add(newString(surrogate)); builder.add(newString(surrogate, 'n')); builder.add(newString('n', surrogate)); builder.add(newString(surrogate, surrogate)); } builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)); ILL_FORMED_STRINGS = builder.build(); } public void testEncodedLength_validStrings() { assertEquals(0, Utf8.encodedLength("")); assertEquals(11, Utf8.encodedLength("Hello world")); assertEquals(8, Utf8.encodedLength("Résumé")); assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare," + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人," + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、" + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、" + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響," + "哈都拕人翻譯做好多話。")); // A surrogate pair assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE))); } public void testEncodedLength_validStrings2() { HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>(); utf8Lengths.put(0x00, 1); utf8Lengths.put(0x7f, 1); utf8Lengths.put(0x80, 2); utf8Lengths.put(0x7ff, 2); utf8Lengths.put(0x800, 3); utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3); utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4); utf8Lengths.put(MAX_CODE_POINT, 4); Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{}); StringBuilder sb = new StringBuilder(); Random rnd = new Random(); for (int trial = 0; trial < 100; trial++) { sb.setLength(0); int utf8Length = 0; for (int i = 0; i < 6; i++) { Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)]; sb.appendCodePoint(randomCodePoint); utf8Length += utf8Lengths.get(randomCodePoint); if (utf8Length != Utf8.encodedLength(sb)) { StringBuilder repro = new StringBuilder(); for (int j = 0; j < sb.length(); j++) { repro.append(" " + (int) sb.charAt(j)); // GWT compatible } assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb)); } } } } public void testEncodedLength_invalidStrings() { testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0); testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6); testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0); testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6); testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0); } private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) { try { Utf8.encodedLength(invalidString); fail(); } catch (IllegalArgumentException expected) { assertThat(expected).hasMessage("Unpaired surrogate at index " + invalidCodePointIndex); } } // 128 - [chars 0x0000 to 0x007f] private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; // 128 private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; // 1920 [chars 0x0080 to 0x07FF] private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; // 18,304 private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = // Both bytes are one byte characters (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) + // The possible number of two byte characters TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; // 2048 private static final long THREE_BYTE_SURROGATES = 2 * 1024; // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; // 2,650,112 private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = // All one byte characters (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) + // One two byte character and a one byte character 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + // Three byte characters THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; // 1,048,576 [chars 0x10000L to 0x10FFFF] private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; // 289,571,839 private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = // All one byte characters (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) + // One and three byte characters 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + // Two two byte characters TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS + // Permutations of one and two byte characters 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + // Four byte characters FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; /** Tests that round tripping of all two byte permutations work. */ @GwtIncompatible // java.nio.charset.Charset public void testIsWellFormed_1Byte() { testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT); } /** Tests that round tripping of all two byte permutations work. */ @GwtIncompatible // java.nio.charset.Charset public void testIsWellFormed_2Bytes() { testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT); } /** Tests that round tripping of all three byte permutations work. */ @GwtIncompatible // java.nio.charset.Charset public void testIsWellFormed_3Bytes() { testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT); } /** * Tests that round tripping of a sample of four byte permutations work. * All permutations are prohibitively expensive to test for automated runs. * This method tests specific four-byte cases. */ public void testIsWellFormed_4BytesSamples() { // Valid 4 byte. assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2); // Bad trailing bytes assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F); assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0); // Special cases for byte2 assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2); assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2); } /** Tests some hard-coded test cases. */ public void testSomeSequences() { // Empty assertWellFormed(); // One-byte characters, including control characters assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f" // Two-byte characters assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2" // Three-byte characters assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac" // Four-byte characters // "\u024B62\u024B62" assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32); // Mixed string // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62" assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32); // Not a valid string assertNotWellFormed(-1, 0, -1, 0); } public void testShardsHaveExpectedRoundTrippables() { // A sanity check. long actual = 0; for (long expected : generateFourByteShardsExpectedRunnables()) { actual += expected; } assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual); } private static String newString(char... chars) { return new String(chars); } private static byte[] toByteArray(int... bytes) { byte[] realBytes = new byte[bytes.length]; for (int i = 0; i < bytes.length; i++) { realBytes[i] = (byte) bytes[i]; } return realBytes; } private static void assertWellFormed(int... bytes) { assertTrue(Utf8.isWellFormed(toByteArray(bytes))); } private static void assertNotWellFormed(int... bytes) { assertFalse(Utf8.isWellFormed(toByteArray(bytes))); } private static long[] generateFourByteShardsExpectedRunnables() { long[] expected = new long[128]; // 0-63 are all 5300224 for (int i = 0; i <= 63; i++) { expected[i] = 5300224; } // 97-111 are all 2342912 for (int i = 97; i <= 111; i++) { expected[i] = 2342912; } // 113-117 are all 1048576 for (int i = 113; i <= 117; i++) { expected[i] = 1048576; } // One offs expected[112] = 786432; expected[118] = 786432; expected[119] = 1048576; expected[120] = 458752; expected[121] = 524288; expected[122] = 65536; // Anything not assigned was the default 0. return expected; } /** * Helper to run the loop to test all the permutations for the number of bytes * specified. * * @param numBytes the number of bytes in the byte array * @param expectedCount the expected number of roundtrippable permutations */ @GwtIncompatible // java.nio.charset.Charset private static void testBytes(int numBytes, long expectedCount) { testBytes(numBytes, expectedCount, 0, -1); } /** * Helper to run the loop to test all the permutations for the number of bytes * specified. This overload is useful for debugging to get the loop to start * at a certain character. * * @param numBytes the number of bytes in the byte array * @param expectedCount the expected number of roundtrippable permutations * @param start the starting bytes encoded as a long as big-endian * @param lim the limit of bytes to process encoded as a long as big-endian, * or -1 to mean the max limit for numBytes */ @GwtIncompatible // java.nio.charset.Charset private static void testBytes(int numBytes, long expectedCount, long start, long lim) { byte[] bytes = new byte[numBytes]; if (lim == -1) { lim = 1L << (numBytes * 8); } long countRoundTripped = 0; for (long byteChar = start; byteChar < lim; byteChar++) { long tmpByteChar = byteChar; for (int i = 0; i < numBytes; i++) { bytes[bytes.length - i - 1] = (byte) tmpByteChar; tmpByteChar = tmpByteChar >> 8; } boolean isRoundTrippable = Utf8.isWellFormed(bytes); assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes)); String s = new String(bytes, Charsets.UTF_8); byte[] bytesReencoded = s.getBytes(Charsets.UTF_8); boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); if (bytesEqual != isRoundTrippable) { fail(); } if (isRoundTrippable) { countRoundTripped++; } } assertEquals(expectedCount, countRoundTripped); } }