/*
* Copyright (C) 2014 Square, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package okio;
import java.io.EOFException;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public final class Utf8Test {
@Test public void oneByteCharacters() throws Exception {
assertEncoded("00", 0x00); // Smallest 1-byte character.
assertEncoded("20", ' ');
assertEncoded("7e", '~');
assertEncoded("7f", 0x7f); // Largest 1-byte character.
}
@Test public void twoByteCharacters() throws Exception {
assertEncoded("c280", 0x0080); // Smallest 2-byte character.
assertEncoded("c3bf", 0x00ff);
assertEncoded("c480", 0x0100);
assertEncoded("dfbf", 0x07ff); // Largest 2-byte character.
}
@Test public void threeByteCharacters() throws Exception {
assertEncoded("e0a080", 0x0800); // Smallest 3-byte character.
assertEncoded("e0bfbf", 0x0fff);
assertEncoded("e18080", 0x1000);
assertEncoded("e1bfbf", 0x1fff);
assertEncoded("ed8080", 0xd000);
assertEncoded("ed9fbf", 0xd7ff); // Largest character lower than the min surrogate.
assertEncoded("ee8080", 0xe000); // Smallest character greater than the max surrogate.
assertEncoded("eebfbf", 0xefff);
assertEncoded("ef8080", 0xf000);
assertEncoded("efbfbf", 0xffff); // Largest 3-byte character.
}
// @Test public void fourByteCharacters() throws Exception {
// assertEncoded("f0908080", 0x010000); // Smallest surrogate pair.
// assertEncoded("f48fbfbf", 0x10ffff); // Largest code point expressible by UTF-16.
// }
//
// @Test public void danglingHighSurrogate() throws Exception {
// assertStringEncoded("3f", "\ud800"); // "?"
// }
//
// @Test public void lowSurrogateWithoutHighSurrogate() throws Exception {
// assertStringEncoded("3f", "\udc00"); // "?"
// }
//
// @Test public void highSurrogateFollowedByNonSurrogate() throws Exception {
// assertStringEncoded("3f61", "\ud800\u0061"); // "?a": Following character is too low.
// assertStringEncoded("3fee8080", "\ud800\ue000"); // "?\ue000": Following character is too high.
// }
@Test public void multipleSegmentString() throws Exception {
String a = TestUtil.repeat('a', Segment.SIZE + Segment.SIZE + 1);
Buffer encoded = new Buffer().writeUtf8(a);
Buffer expected = new Buffer().write(a.getBytes(Util.UTF_8));
assertEquals(expected, encoded);
}
@Test public void stringSpansSegments() throws Exception {
Buffer buffer = new Buffer();
String a = TestUtil.repeat('a', Segment.SIZE - 1);
String b = "bb";
String c = TestUtil.repeat('c', Segment.SIZE - 1);
buffer.writeUtf8(a);
buffer.writeUtf8(b);
buffer.writeUtf8(c);
assertEquals(a + b + c, buffer.readUtf8());
}
@Test public void readEmptyBufferThrowsEofException() throws Exception {
Buffer buffer = new Buffer();
try {
buffer.readUtf8CodePoint();
fail();
} catch (EOFException expected) {
}
}
@Test public void readLeadingContinuationByteReturnsReplacementCharacter() throws Exception {
Buffer buffer = new Buffer();
buffer.writeByte(0xbf);
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertTrue(buffer.exhausted());
}
@Test public void readMissingContinuationBytesThrowsEofException() throws Exception {
Buffer buffer = new Buffer();
buffer.writeByte(0xdf);
try {
buffer.readUtf8CodePoint();
fail();
} catch (EOFException expected) {
}
assertFalse(buffer.exhausted()); // Prefix byte wasn't consumed.
}
@Test public void readTooLargeCodepointReturnsReplacementCharacter() throws Exception {
// 5-byte and 6-byte code points are not supported.
Buffer buffer = new Buffer();
buffer.write(ByteString.decodeHex("f888808080"));
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertTrue(buffer.exhausted());
}
@Test public void readNonContinuationBytesReturnsReplacementCharacter() throws Exception {
// Use a non-continuation byte where a continuation byte is expected.
Buffer buffer = new Buffer();
buffer.write(ByteString.decodeHex("df20"));
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertEquals(0x20, buffer.readUtf8CodePoint()); // Non-continuation character not consumed.
assertTrue(buffer.exhausted());
}
@Test public void readCodePointBeyondUnicodeMaximum() throws Exception {
// A 4-byte encoding with data above the U+10ffff Unicode maximum.
Buffer buffer = new Buffer();
buffer.write(ByteString.decodeHex("f4908080"));
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertTrue(buffer.exhausted());
}
@Test public void readSurrogateCodePoint() throws Exception {
Buffer buffer = new Buffer();
buffer.write(ByteString.decodeHex("eda080"));
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertTrue(buffer.exhausted());
buffer.write(ByteString.decodeHex("edbfbf"));
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertTrue(buffer.exhausted());
}
@Test public void readOverlongCodePoint() throws Exception {
// Use 2 bytes to encode data that only needs 1 byte.
Buffer buffer = new Buffer();
buffer.write(ByteString.decodeHex("c080"));
assertEquals(Buffer.REPLACEMENT_CHARACTER, buffer.readUtf8CodePoint());
assertTrue(buffer.exhausted());
}
@Test public void writeSurrogateCodePoint() throws Exception {
Buffer buffer = new Buffer();
buffer.writeUtf8CodePoint(0xd7ff); // Below lowest surrogate is okay.
try {
buffer.writeUtf8CodePoint(0xd800); // Lowest surrogate throws.
fail();
} catch (IllegalArgumentException expected) {
}
try {
buffer.writeUtf8CodePoint(0xdfff); // Highest surrogate throws.
fail();
} catch (IllegalArgumentException expected) {
}
buffer.writeUtf8CodePoint(0xe000); // Above highest surrogate is okay.
}
@Test public void writeCodePointBeyondUnicodeMaximum() throws Exception {
Buffer buffer = new Buffer();
try {
buffer.writeUtf8CodePoint(0x110000);
fail();
} catch (IllegalArgumentException expected) {
}
}
private void assertEncoded(String hex, int... codePoints) throws Exception {
assertCodePointEncoded(hex, codePoints);
assertCodePointDecoded(hex, codePoints);
assertStringEncoded(hex, new String(codePoints, 0, codePoints.length));
}
private void assertCodePointEncoded(String hex, int... codePoints) throws Exception {
Buffer buffer = new Buffer();
for (int codePoint : codePoints) {
buffer.writeUtf8CodePoint(codePoint);
}
assertEquals(buffer.readByteString(), ByteString.decodeHex(hex));
}
private void assertCodePointDecoded(String hex, int... codePoints) throws Exception {
Buffer buffer = new Buffer().write(ByteString.decodeHex(hex));
for (int codePoint : codePoints) {
assertEquals(codePoint, buffer.readUtf8CodePoint());
}
assertTrue(buffer.exhausted());
}
private void assertStringEncoded(String hex, String string) throws Exception {
ByteString expectedUtf8 = ByteString.decodeHex(hex);
// Confirm our expectations are consistent with the platform.
ByteString platformUtf8 = ByteString.of(string.getBytes("UTF-8"));
assertEquals(expectedUtf8, platformUtf8);
// Confirm our implementation matches those expectations.
ByteString actualUtf8 = new Buffer().writeUtf8(string).readByteString();
assertEquals(expectedUtf8, actualUtf8);
}
}